From 759cfef3ac4c07dba1ece0bbc1207e099348816d Mon Sep 17 00:00:00 2001 From: Alan Modra Date: Sat, 17 Aug 2013 18:47:22 +0930 Subject: PowerPC LE memcpy http://sourceware.org/ml/libc-alpha/2013-08/msg00103.html Little-endian support for memcpy. I spent some time cleaning up the 64-bit power7 memcpy, in order to avoid the extra alignment traps power7 takes for little-endian. It probably would have been better to copy the Linux kernel version of memcpy. * sysdeps/powerpc/powerpc32/power4/memcpy.S: Add little endian support. * sysdeps/powerpc/powerpc32/power6/memcpy.S: Likewise. * sysdeps/powerpc/powerpc32/power7/memcpy.S: Likewise. * sysdeps/powerpc/powerpc32/power7/mempcpy.S: Likewise. * sysdeps/powerpc/powerpc64/memcpy.S: Likewise. * sysdeps/powerpc/powerpc64/power4/memcpy.S: Likewise. * sysdeps/powerpc/powerpc64/power6/memcpy.S: Likewise. * sysdeps/powerpc/powerpc64/power7/memcpy.S: Likewise. * sysdeps/powerpc/powerpc64/power7/mempcpy.S: Likewise. Make better use of regs. Use power7 mtocrf. Tidy function tails. 
--- sysdeps/powerpc/powerpc32/power4/memcpy.S | 58 +++ sysdeps/powerpc/powerpc32/power6/memcpy.S | 81 +++- sysdeps/powerpc/powerpc32/power7/memcpy.S | 24 +- sysdeps/powerpc/powerpc32/power7/mempcpy.S | 28 +- sysdeps/powerpc/powerpc64/memcpy.S | 27 ++ sysdeps/powerpc/powerpc64/power4/memcpy.S | 61 ++- sysdeps/powerpc/powerpc64/power6/memcpy.S | 329 ++++++++++++++ sysdeps/powerpc/powerpc64/power7/memcpy.S | 704 +++++++++++++---------------- sysdeps/powerpc/powerpc64/power7/mempcpy.S | 26 +- 9 files changed, 928 insertions(+), 410 deletions(-) (limited to 'sysdeps/powerpc') diff --git a/sysdeps/powerpc/powerpc32/power4/memcpy.S b/sysdeps/powerpc/powerpc32/power4/memcpy.S index d9146631e3..338d3cce30 100644 --- a/sysdeps/powerpc/powerpc32/power4/memcpy.S +++ b/sysdeps/powerpc/powerpc32/power4/memcpy.S @@ -203,15 +203,28 @@ EALIGN (memcpy, 5, 0) blt cr6,5f srwi 7,6,16 bgt cr6,3f +#ifdef __LITTLE_ENDIAN__ + sth 7,0(3) +#else sth 6,0(3) +#endif b 7f .align 4 3: +#ifdef __LITTLE_ENDIAN__ + rotlwi 6,6,24 + stb 6,0(3) + sth 7,1(3) +#else stb 7,0(3) sth 6,1(3) +#endif b 7f .align 4 5: +#ifdef __LITTLE_ENDIAN__ + rotlwi 6,6,8 +#endif stb 6,0(3) 7: cmplwi cr1,10,16 @@ -339,13 +352,23 @@ EALIGN (memcpy, 5, 0) bf 30,1f /* there are at least two words to copy, so copy them */ +#ifdef __LITTLE_ENDIAN__ + srw 0,6,10 + slw 8,7,9 +#else slw 0,6,10 /* shift 1st src word to left align it in R0 */ srw 8,7,9 /* shift 2nd src word to right align it in R8 */ +#endif or 0,0,8 /* or them to get word to store */ lwz 6,8(5) /* load the 3rd src word */ stw 0,0(4) /* store the 1st dst word */ +#ifdef __LITTLE_ENDIAN__ + srw 0,7,10 + slw 8,6,9 +#else slw 0,7,10 /* now left align 2nd src word into R0 */ srw 8,6,9 /* shift 3rd src word to right align it in R8 */ +#endif or 0,0,8 /* or them to get word to store */ lwz 7,12(5) stw 0,4(4) /* store the 2nd dst word */ @@ -353,8 +376,13 @@ EALIGN (memcpy, 5, 0) addi 5,5,16 bf 31,4f /* there is a third word to copy, so copy it */ +#ifdef __LITTLE_ENDIAN__ 
+ srw 0,6,10 + slw 8,7,9 +#else slw 0,6,10 /* shift 3rd src word to left align it in R0 */ srw 8,7,9 /* shift 4th src word to right align it in R8 */ +#endif or 0,0,8 /* or them to get word to store */ stw 0,0(4) /* store 3rd dst word */ mr 6,7 @@ -364,8 +392,13 @@ EALIGN (memcpy, 5, 0) b 4f .align 4 1: +#ifdef __LITTLE_ENDIAN__ + srw 0,6,10 + slw 8,7,9 +#else slw 0,6,10 /* shift 1st src word to left align it in R0 */ srw 8,7,9 /* shift 2nd src word to right align it in R8 */ +#endif addi 5,5,8 or 0,0,8 /* or them to get word to store */ bf 31,4f @@ -378,23 +411,43 @@ EALIGN (memcpy, 5, 0) .align 4 4: /* copy 16 bytes at a time */ +#ifdef __LITTLE_ENDIAN__ + srw 0,6,10 + slw 8,7,9 +#else slw 0,6,10 srw 8,7,9 +#endif or 0,0,8 lwz 6,0(5) stw 0,0(4) +#ifdef __LITTLE_ENDIAN__ + srw 0,7,10 + slw 8,6,9 +#else slw 0,7,10 srw 8,6,9 +#endif or 0,0,8 lwz 7,4(5) stw 0,4(4) +#ifdef __LITTLE_ENDIAN__ + srw 0,6,10 + slw 8,7,9 +#else slw 0,6,10 srw 8,7,9 +#endif or 0,0,8 lwz 6,8(5) stw 0,8(4) +#ifdef __LITTLE_ENDIAN__ + srw 0,7,10 + slw 8,6,9 +#else slw 0,7,10 srw 8,6,9 +#endif or 0,0,8 lwz 7,12(5) stw 0,12(4) @@ -403,8 +456,13 @@ EALIGN (memcpy, 5, 0) bdnz+ 4b 8: /* calculate and store the final word */ +#ifdef __LITTLE_ENDIAN__ + srw 0,6,10 + slw 8,7,9 +#else slw 0,6,10 srw 8,7,9 +#endif or 0,0,8 stw 0,0(4) 3: diff --git a/sysdeps/powerpc/powerpc32/power6/memcpy.S b/sysdeps/powerpc/powerpc32/power6/memcpy.S index a76f71e04f..f58114a0c5 100644 --- a/sysdeps/powerpc/powerpc32/power6/memcpy.S +++ b/sysdeps/powerpc/powerpc32/power6/memcpy.S @@ -219,15 +219,28 @@ L(word_unaligned_short): blt cr6,5f srwi 7,6,16 bgt cr6,3f +#ifdef __LITTLE_ENDIAN__ + sth 7,0(3) +#else sth 6,0(3) +#endif b 7f .align 4 3: +#ifdef __LITTLE_ENDIAN__ + rotlwi 6,6,24 + stb 6,0(3) + sth 7,1(3) +#else stb 7,0(3) sth 6,1(3) +#endif b 7f .align 4 5: +#ifdef __LITTLE_ENDIAN__ + rotlwi 6,6,8 +#endif stb 6,0(3) 7: cmplwi cr1,10,16 @@ -577,7 +590,11 @@ L(wdu1_32): lwz 6,-1(4) cmplwi cr6,31,4 srwi 8,31,5 /* 
calculate the 32 byte loop count */ +#ifdef __LITTLE_ENDIAN__ + srwi 6,6,8 +#else slwi 6,6,8 +#endif clrlwi 31,31,27 /* The remaining bytes, < 32. */ blt cr5,L(wdu1_32tail) mtctr 8 @@ -585,8 +602,12 @@ L(wdu1_32): lwz 8,3(4) lwz 7,4(4) +#ifdef __LITTLE_ENDIAN__ + rldimi 6,8,24,32 +#else /* Equivalent to: srwi 8,8,32-8; or 6,6,8 */ rlwimi 6,8,8,(32-8),31 +#endif b L(wdu1_loop32x) .align 4 L(wdu1_loop32): @@ -595,8 +616,12 @@ L(wdu1_loop32): lwz 7,4(4) stw 10,-8(3) stw 11,-4(3) +#ifdef __LITTLE_ENDIAN__ + rldimi 6,8,24,32 +#else /* Equivalent to srwi 8,8,32-8; or 6,6,8 */ rlwimi 6,8,8,(32-8),31 +#endif L(wdu1_loop32x): lwz 10,8(4) lwz 11,12(4) @@ -613,7 +638,11 @@ L(wdu1_loop32x): stw 6,16(3) stw 7,20(3) addi 3,3,32 +#ifdef __LITTLE_ENDIAN__ + srwi 6,8,8 +#else slwi 6,8,8 +#endif bdnz+ L(wdu1_loop32) stw 10,-8(3) stw 11,-4(3) @@ -624,8 +653,12 @@ L(wdu1_32tail): blt cr6,L(wdu_4tail) /* calculate and store the final word */ lwz 8,3(4) -/* Equivalent to: srwi 8,8,32-9; or 6,6,8 */ +#ifdef __LITTLE_ENDIAN__ + rldimi 6,8,24,32 +#else +/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */ rlwimi 6,8,8,(32-8),31 +#endif b L(wdu_32tailx) L(wdu2_32): @@ -633,7 +666,11 @@ L(wdu2_32): lwz 6,-2(4) cmplwi cr6,31,4 srwi 8,31,5 /* calculate the 32 byte loop count */ +#ifdef __LITTLE_ENDIAN__ + srwi 6,6,16 +#else slwi 6,6,16 +#endif clrlwi 31,31,27 /* The remaining bytes, < 32. 
*/ blt cr5,L(wdu2_32tail) mtctr 8 @@ -641,8 +678,11 @@ L(wdu2_32): lwz 8,2(4) lwz 7,4(4) -/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */ +#ifdef __LITTLE_ENDIAN__ + rldimi 6,8,16,32 +#else rlwimi 6,8,16,(32-16),31 +#endif b L(wdu2_loop32x) .align 4 L(wdu2_loop32): @@ -651,8 +691,11 @@ L(wdu2_loop32): lwz 7,4(4) stw 10,-8(3) stw 11,-4(3) -/* Equivalent to srwi 8,8,32-8; or 6,6,8 */ +#ifdef __LITTLE_ENDIAN__ + rldimi 6,8,16,32 +#else rlwimi 6,8,16,(32-16),31 +#endif L(wdu2_loop32x): lwz 10,8(4) lwz 11,12(4) @@ -670,7 +713,11 @@ L(wdu2_loop32x): stw 6,16(3) stw 7,20(3) addi 3,3,32 +#ifdef __LITTLE_ENDIAN__ + srwi 6,8,16 +#else slwi 6,8,16 +#endif bdnz+ L(wdu2_loop32) stw 10,-8(3) stw 11,-4(3) @@ -681,8 +728,11 @@ L(wdu2_32tail): blt cr6,L(wdu_4tail) /* calculate and store the final word */ lwz 8,2(4) -/* Equivalent to: srwi 8,8,32-9; or 6,6,8 */ +#ifdef __LITTLE_ENDIAN__ + rldimi 6,8,16,32 +#else rlwimi 6,8,16,(32-16),31 +#endif b L(wdu_32tailx) L(wdu3_32): @@ -690,7 +740,11 @@ L(wdu3_32): lwz 6,-3(4) cmplwi cr6,31,4 srwi 8,31,5 /* calculate the 32 byte loop count */ +#ifdef __LITTLE_ENDIAN__ + srwi 6,6,24 +#else slwi 6,6,24 +#endif clrlwi 31,31,27 /* The remaining bytes, < 32. 
*/ blt cr5,L(wdu3_32tail) mtctr 8 @@ -698,8 +752,11 @@ L(wdu3_32): lwz 8,1(4) lwz 7,4(4) -/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */ +#ifdef __LITTLE_ENDIAN__ + rldimi 6,8,8,32 +#else rlwimi 6,8,24,(32-24),31 +#endif b L(wdu3_loop32x) .align 4 L(wdu3_loop32): @@ -708,8 +765,11 @@ L(wdu3_loop32): lwz 7,4(4) stw 10,-8(3) stw 11,-4(3) -/* Equivalent to srwi 8,8,32-8; or 6,6,8 */ +#ifdef __LITTLE_ENDIAN__ + rldimi 6,8,8,32 +#else rlwimi 6,8,24,(32-24),31 +#endif L(wdu3_loop32x): lwz 10,8(4) lwz 11,12(4) @@ -726,7 +786,11 @@ L(wdu3_loop32x): stw 6,16(3) stw 7,20(3) addi 3,3,32 +#ifdef __LITTLE_ENDIAN__ + srwi 6,8,24 +#else slwi 6,8,24 +#endif bdnz+ L(wdu3_loop32) stw 10,-8(3) stw 11,-4(3) @@ -737,8 +801,11 @@ L(wdu3_32tail): blt cr6,L(wdu_4tail) /* calculate and store the final word */ lwz 8,1(4) -/* Equivalent to: srwi 8,8,32-9; or 6,6,8 */ +#ifdef __LITTLE_ENDIAN__ + rldimi 6,8,8,32 +#else rlwimi 6,8,24,(32-24),31 +#endif b L(wdu_32tailx) .align 4 L(wdu_32tailx): diff --git a/sysdeps/powerpc/powerpc32/power7/memcpy.S b/sysdeps/powerpc/powerpc32/power7/memcpy.S index 7f00778236..acf3c10198 100644 --- a/sysdeps/powerpc/powerpc32/power7/memcpy.S +++ b/sysdeps/powerpc/powerpc32/power7/memcpy.S @@ -383,7 +383,7 @@ L(copy_GE_32_unaligned): beq L(copy_GE_32_unaligned_cont) - /* SRC is not quadword aligned, get it aligned. */ + /* DST is not quadword aligned, get it aligned. */ mtcrf 0x01,0 subf 31,0,5 @@ -435,13 +435,21 @@ L(copy_GE_32_unaligned_cont): mr 11,12 mtcrf 0x01,9 cmplwi cr6,9,1 +#ifdef __LITTLE_ENDIAN__ + lvsr 5,0,12 +#else lvsl 5,0,12 +#endif lvx 3,0,12 bf 31,L(setup_unaligned_loop) /* Copy another 16 bytes to align to 32-bytes due to the loop . */ lvx 4,12,6 +#ifdef __LITTLE_ENDIAN__ + vperm 6,4,3,5 +#else vperm 6,3,4,5 +#endif addi 11,12,16 addi 10,3,16 stvx 6,0,3 @@ -461,11 +469,17 @@ L(unaligned_loop): vector instructions though. */ lvx 4,11,6 /* vr4 = r11+16. */ - vperm 6,3,4,5 /* Merge the correctly-aligned portions - of vr3/vr4 into vr6. 
*/ +#ifdef __LITTLE_ENDIAN__ + vperm 6,4,3,5 +#else + vperm 6,3,4,5 +#endif lvx 3,11,7 /* vr3 = r11+32. */ - vperm 10,4,3,5 /* Merge the correctly-aligned portions - of vr3/vr4 into vr10. */ +#ifdef __LITTLE_ENDIAN__ + vperm 10,3,4,5 +#else + vperm 10,4,3,5 +#endif addi 11,11,32 stvx 6,0,10 stvx 10,10,6 diff --git a/sysdeps/powerpc/powerpc32/power7/mempcpy.S b/sysdeps/powerpc/powerpc32/power7/mempcpy.S index 5ad4edb580..4610ec5b56 100644 --- a/sysdeps/powerpc/powerpc32/power7/mempcpy.S +++ b/sysdeps/powerpc/powerpc32/power7/mempcpy.S @@ -325,7 +325,7 @@ L(copy_GE_32_unaligned): beq L(copy_GE_32_unaligned_cont) - /* SRC is not quadword aligned, get it aligned. */ + /* DST is not quadword aligned, get it aligned. */ mtcrf 0x01,0 subf 31,0,5 @@ -377,13 +377,21 @@ L(copy_GE_32_unaligned_cont): mr 11,12 mtcrf 0x01,9 cmplwi cr6,9,1 - lvsl 5,0,12 +#ifdef __LITTLE_ENDIAN__ + lvsr 5,0,12 +#else + lvsl 5,0,12 +#endif lvx 3,0,12 bf 31,L(setup_unaligned_loop) /* Copy another 16 bytes to align to 32-bytes due to the loop . */ lvx 4,12,6 - vperm 6,3,4,5 +#ifdef __LITTLE_ENDIAN__ + vperm 6,4,3,5 +#else + vperm 6,3,4,5 +#endif addi 11,12,16 addi 10,3,16 stvx 6,0,3 @@ -403,11 +411,17 @@ L(unaligned_loop): vector instructions though. */ lvx 4,11,6 /* vr4 = r11+16. */ - vperm 6,3,4,5 /* Merge the correctly-aligned portions - of vr3/vr4 into vr6. */ +#ifdef __LITTLE_ENDIAN__ + vperm 6,4,3,5 +#else + vperm 6,3,4,5 +#endif lvx 3,11,7 /* vr3 = r11+32. */ - vperm 10,4,3,5 /* Merge the correctly-aligned portions - of vr3/vr4 into vr10. 
*/ +#ifdef __LITTLE_ENDIAN__ + vperm 10,3,4,5 +#else + vperm 10,4,3,5 +#endif addi 11,11,32 stvx 6,0,10 stvx 10,10,6 diff --git a/sysdeps/powerpc/powerpc64/memcpy.S b/sysdeps/powerpc/powerpc64/memcpy.S index b8c4cc8b10..5fc7401c99 100644 --- a/sysdeps/powerpc/powerpc64/memcpy.S +++ b/sysdeps/powerpc/powerpc64/memcpy.S @@ -212,15 +212,28 @@ EALIGN (memcpy, 5, 0) blt cr6,5f srdi 7,6,16 bgt cr6,3f +#ifdef __LITTLE_ENDIAN__ + sth 7,0(3) +#else sth 6,0(3) +#endif b 7f .align 4 3: +#ifdef __LITTLE_ENDIAN__ + rotlwi 6,6,24 + stb 6,0(3) + sth 7,1(3) +#else stb 7,0(3) sth 6,1(3) +#endif b 7f .align 4 5: +#ifdef __LITTLE_ENDIAN__ + rotlwi 6,6,8 +#endif stb 6,0(3) 7: cmpldi cr1,10,16 @@ -328,7 +341,11 @@ EALIGN (memcpy, 5, 0) ld 7,8(5) subfic 9,10,64 beq 2f +#ifdef __LITTLE_ENDIAN__ + srd 0,6,10 +#else sld 0,6,10 +#endif cmpldi 11,1 mr 6,7 addi 4,4,-8 @@ -336,15 +353,25 @@ EALIGN (memcpy, 5, 0) b 1f 2: addi 5,5,8 .align 4 +#ifdef __LITTLE_ENDIAN__ +0: srd 0,6,10 + sld 8,7,9 +#else 0: sld 0,6,10 srd 8,7,9 +#endif cmpldi 11,2 ld 6,8(5) or 0,0,8 addi 11,11,-2 std 0,0(4) +#ifdef __LITTLE_ENDIAN__ + srd 0,7,10 +1: sld 8,6,9 +#else sld 0,7,10 1: srd 8,6,9 +#endif or 0,0,8 beq 8f ld 7,16(5) diff --git a/sysdeps/powerpc/powerpc64/power4/memcpy.S b/sysdeps/powerpc/powerpc64/power4/memcpy.S index 4317c7e786..f9a7260dcb 100644 --- a/sysdeps/powerpc/powerpc64/power4/memcpy.S +++ b/sysdeps/powerpc/powerpc64/power4/memcpy.S @@ -214,15 +214,28 @@ EALIGN (memcpy, 5, 0) blt cr6,5f srdi 7,6,16 bgt cr6,3f +#ifdef __LITTLE_ENDIAN__ + sth 7,0(3) +#else sth 6,0(3) +#endif b 7f .align 4 3: +#ifdef __LITTLE_ENDIAN__ + rotlwi 6,6,24 + stb 6,0(3) + sth 7,1(3) +#else stb 7,0(3) sth 6,1(3) +#endif b 7f .align 4 5: +#ifdef __LITTLE_ENDIAN__ + rotlwi 6,6,8 +#endif stb 6,0(3) 7: cmpldi cr1,10,16 @@ -334,13 +347,23 @@ EALIGN (memcpy, 5, 0) bf 30,1f /* there are at least two DWs to copy */ +#ifdef __LITTLE_ENDIAN__ + srd 0,6,10 + sld 8,7,9 +#else sld 0,6,10 srd 8,7,9 +#endif or 0,0,8 ld 6,16(5) std 0,0(4) 
+#ifdef __LITTLE_ENDIAN__ + srd 0,7,10 + sld 8,6,9 +#else sld 0,7,10 srd 8,6,9 +#endif or 0,0,8 ld 7,24(5) std 0,8(4) @@ -349,8 +372,13 @@ EALIGN (memcpy, 5, 0) blt cr6,8f /* if total DWs = 3, then bypass loop */ bf 31,4f /* there is a third DW to copy */ +#ifdef __LITTLE_ENDIAN__ + srd 0,6,10 + sld 8,7,9 +#else sld 0,6,10 srd 8,7,9 +#endif or 0,0,8 std 0,0(4) mr 6,7 @@ -361,8 +389,13 @@ EALIGN (memcpy, 5, 0) b 4f .align 4 1: +#ifdef __LITTLE_ENDIAN__ + srd 0,6,10 + sld 8,7,9 +#else sld 0,6,10 srd 8,7,9 +#endif addi 5,5,16 or 0,0,8 bf 31,4f @@ -373,23 +406,44 @@ EALIGN (memcpy, 5, 0) addi 4,4,8 .align 4 /* copy 32 bytes at a time */ -4: sld 0,6,10 +4: +#ifdef __LITTLE_ENDIAN__ + srd 0,6,10 + sld 8,7,9 +#else + sld 0,6,10 srd 8,7,9 +#endif or 0,0,8 ld 6,0(5) std 0,0(4) +#ifdef __LITTLE_ENDIAN__ + srd 0,7,10 + sld 8,6,9 +#else sld 0,7,10 srd 8,6,9 +#endif or 0,0,8 ld 7,8(5) std 0,8(4) +#ifdef __LITTLE_ENDIAN__ + srd 0,6,10 + sld 8,7,9 +#else sld 0,6,10 srd 8,7,9 +#endif or 0,0,8 ld 6,16(5) std 0,16(4) +#ifdef __LITTLE_ENDIAN__ + srd 0,7,10 + sld 8,6,9 +#else sld 0,7,10 srd 8,6,9 +#endif or 0,0,8 ld 7,24(5) std 0,24(4) @@ -399,8 +453,13 @@ EALIGN (memcpy, 5, 0) .align 4 8: /* calculate and store the final DW */ +#ifdef __LITTLE_ENDIAN__ + srd 0,6,10 + sld 8,7,9 +#else sld 0,6,10 srd 8,7,9 +#endif or 0,0,8 std 0,0(4) 3: diff --git a/sysdeps/powerpc/powerpc64/power6/memcpy.S b/sysdeps/powerpc/powerpc64/power6/memcpy.S index d6d242d293..e3f3d8a303 100644 --- a/sysdeps/powerpc/powerpc64/power6/memcpy.S +++ b/sysdeps/powerpc/powerpc64/power6/memcpy.S @@ -400,15 +400,28 @@ L(das_tail2): blt cr6,5f srdi 7,6,16 bgt cr6,3f +#ifdef __LITTLE_ENDIAN__ + sth 7,0(3) +#else sth 6,0(3) +#endif b 7f .align 4 3: +#ifdef __LITTLE_ENDIAN__ + rotlwi 6,6,24 + stb 6,0(3) + sth 7,1(3) +#else stb 7,0(3) sth 6,1(3) +#endif b 7f .align 4 5: +#ifdef __LITTLE_ENDIAN__ + rotlwi 6,6,8 +#endif stb 6,0(3) 7: cmpldi cr1,10,16 @@ -595,13 +608,24 @@ L(du1_do): bf 30,L(du1_1dw) /* there are at least two 
DWs to copy */ + /* FIXME: can combine last shift and "or" into "rldimi" */ +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 8 + sldi 8,7, 64-8 +#else sldi 0,6, 8 srdi 8,7, 64-8 +#endif or 0,0,8 ld 6,16(5) std 0,0(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,7, 8 + sldi 8,6, 64-8 +#else sldi 0,7, 8 srdi 8,6, 64-8 +#endif or 0,0,8 ld 7,24(5) std 0,8(4) @@ -610,8 +634,13 @@ L(du1_do): blt cr6,L(du1_fini) /* if total DWs = 3, then bypass loop */ bf 31,L(du1_loop) /* there is a third DW to copy */ +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 8 + sldi 8,7, 64-8 +#else sldi 0,6, 8 srdi 8,7, 64-8 +#endif or 0,0,8 std 0,0(4) mr 6,7 @@ -622,8 +651,13 @@ L(du1_do): b L(du1_loop) .align 4 L(du1_1dw): +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 8 + sldi 8,7, 64-8 +#else sldi 0,6, 8 srdi 8,7, 64-8 +#endif addi 5,5,16 or 0,0,8 bf 31,L(du1_loop) @@ -635,23 +669,43 @@ L(du1_1dw): .align 4 /* copy 32 bytes at a time */ L(du1_loop): +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 8 + sldi 8,7, 64-8 +#else sldi 0,6, 8 srdi 8,7, 64-8 +#endif or 0,0,8 ld 6,0(5) std 0,0(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,7, 8 + sldi 8,6, 64-8 +#else sldi 0,7, 8 srdi 8,6, 64-8 +#endif or 0,0,8 ld 7,8(5) std 0,8(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 8 + sldi 8,7, 64-8 +#else sldi 0,6, 8 srdi 8,7, 64-8 +#endif or 0,0,8 ld 6,16(5) std 0,16(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,7, 8 + sldi 8,6, 64-8 +#else sldi 0,7, 8 srdi 8,6, 64-8 +#endif or 0,0,8 ld 7,24(5) std 0,24(4) @@ -661,8 +715,13 @@ L(du1_loop): .align 4 L(du1_fini): /* calculate and store the final DW */ +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 8 + sldi 8,7, 64-8 +#else sldi 0,6, 8 srdi 8,7, 64-8 +#endif or 0,0,8 std 0,0(4) b L(du_done) @@ -672,13 +731,23 @@ L(du2_do): bf 30,L(du2_1dw) /* there are at least two DWs to copy */ +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 16 + sldi 8,7, 64-16 +#else sldi 0,6, 16 srdi 8,7, 64-16 +#endif or 0,0,8 ld 6,16(5) std 0,0(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,7, 16 + sldi 8,6, 64-16 +#else sldi 0,7, 16 srdi 8,6, 64-16 +#endif or 0,0,8 ld 7,24(5) std 
0,8(4) @@ -687,8 +756,13 @@ L(du2_do): blt cr6,L(du2_fini) /* if total DWs = 3, then bypass loop */ bf 31,L(du2_loop) /* there is a third DW to copy */ +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 16 + sldi 8,7, 64-16 +#else sldi 0,6, 16 srdi 8,7, 64-16 +#endif or 0,0,8 std 0,0(4) mr 6,7 @@ -699,8 +773,13 @@ L(du2_do): b L(du2_loop) .align 4 L(du2_1dw): +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 16 + sldi 8,7, 64-16 +#else sldi 0,6, 16 srdi 8,7, 64-16 +#endif addi 5,5,16 or 0,0,8 bf 31,L(du2_loop) @@ -712,23 +791,43 @@ L(du2_1dw): .align 4 /* copy 32 bytes at a time */ L(du2_loop): +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 16 + sldi 8,7, 64-16 +#else sldi 0,6, 16 srdi 8,7, 64-16 +#endif or 0,0,8 ld 6,0(5) std 0,0(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,7, 16 + sldi 8,6, 64-16 +#else sldi 0,7, 16 srdi 8,6, 64-16 +#endif or 0,0,8 ld 7,8(5) std 0,8(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 16 + sldi 8,7, 64-16 +#else sldi 0,6, 16 srdi 8,7, 64-16 +#endif or 0,0,8 ld 6,16(5) std 0,16(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,7, 16 + sldi 8,6, 64-16 +#else sldi 0,7, 16 srdi 8,6, 64-16 +#endif or 0,0,8 ld 7,24(5) std 0,24(4) @@ -738,8 +837,13 @@ L(du2_loop): .align 4 L(du2_fini): /* calculate and store the final DW */ +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 16 + sldi 8,7, 64-16 +#else sldi 0,6, 16 srdi 8,7, 64-16 +#endif or 0,0,8 std 0,0(4) b L(du_done) @@ -749,13 +853,23 @@ L(du3_do): bf 30,L(du3_1dw) /* there are at least two DWs to copy */ +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 24 + sldi 8,7, 64-24 +#else sldi 0,6, 24 srdi 8,7, 64-24 +#endif or 0,0,8 ld 6,16(5) std 0,0(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,7, 24 + sldi 8,6, 64-24 +#else sldi 0,7, 24 srdi 8,6, 64-24 +#endif or 0,0,8 ld 7,24(5) std 0,8(4) @@ -764,8 +878,13 @@ L(du3_do): blt cr6,L(du3_fini) /* if total DWs = 3, then bypass loop */ bf 31,L(du3_loop) /* there is a third DW to copy */ +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 24 + sldi 8,7, 64-24 +#else sldi 0,6, 24 srdi 8,7, 64-24 +#endif or 0,0,8 std 0,0(4) mr 6,7 @@ -776,8 +895,13 @@ 
L(du3_do): b L(du3_loop) .align 4 L(du3_1dw): +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 24 + sldi 8,7, 64-24 +#else sldi 0,6, 24 srdi 8,7, 64-24 +#endif addi 5,5,16 or 0,0,8 bf 31,L(du3_loop) @@ -789,23 +913,43 @@ L(du3_1dw): .align 4 /* copy 32 bytes at a time */ L(du3_loop): +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 24 + sldi 8,7, 64-24 +#else sldi 0,6, 24 srdi 8,7, 64-24 +#endif or 0,0,8 ld 6,0(5) std 0,0(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,7, 24 + sldi 8,6, 64-24 +#else sldi 0,7, 24 srdi 8,6, 64-24 +#endif or 0,0,8 ld 7,8(5) std 0,8(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 24 + sldi 8,7, 64-24 +#else sldi 0,6, 24 srdi 8,7, 64-24 +#endif or 0,0,8 ld 6,16(5) std 0,16(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,7, 24 + sldi 8,6, 64-24 +#else sldi 0,7, 24 srdi 8,6, 64-24 +#endif or 0,0,8 ld 7,24(5) std 0,24(4) @@ -815,8 +959,13 @@ L(du3_loop): .align 4 L(du3_fini): /* calculate and store the final DW */ +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 24 + sldi 8,7, 64-24 +#else sldi 0,6, 24 srdi 8,7, 64-24 +#endif or 0,0,8 std 0,0(4) b L(du_done) @@ -832,13 +981,23 @@ L(du4_dox): bf 30,L(du4_1dw) /* there are at least two DWs to copy */ +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 32 + sldi 8,7, 64-32 +#else sldi 0,6, 32 srdi 8,7, 64-32 +#endif or 0,0,8 ld 6,16(5) std 0,0(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,7, 32 + sldi 8,6, 64-32 +#else sldi 0,7, 32 srdi 8,6, 64-32 +#endif or 0,0,8 ld 7,24(5) std 0,8(4) @@ -847,8 +1006,13 @@ L(du4_dox): blt cr6,L(du4_fini) /* if total DWs = 3, then bypass loop */ bf 31,L(du4_loop) /* there is a third DW to copy */ +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 32 + sldi 8,7, 64-32 +#else sldi 0,6, 32 srdi 8,7, 64-32 +#endif or 0,0,8 std 0,0(4) mr 6,7 @@ -859,8 +1023,13 @@ L(du4_dox): b L(du4_loop) .align 4 L(du4_1dw): +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 32 + sldi 8,7, 64-32 +#else sldi 0,6, 32 srdi 8,7, 64-32 +#endif addi 5,5,16 or 0,0,8 bf 31,L(du4_loop) @@ -872,23 +1041,43 @@ L(du4_1dw): .align 4 /* copy 32 bytes at a time */ L(du4_loop): +#ifdef 
__LITTLE_ENDIAN__ + srdi 0,6, 32 + sldi 8,7, 64-32 +#else sldi 0,6, 32 srdi 8,7, 64-32 +#endif or 0,0,8 ld 6,0(5) std 0,0(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,7, 32 + sldi 8,6, 64-32 +#else sldi 0,7, 32 srdi 8,6, 64-32 +#endif or 0,0,8 ld 7,8(5) std 0,8(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 32 + sldi 8,7, 64-32 +#else sldi 0,6, 32 srdi 8,7, 64-32 +#endif or 0,0,8 ld 6,16(5) std 0,16(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,7, 32 + sldi 8,6, 64-32 +#else sldi 0,7, 32 srdi 8,6, 64-32 +#endif or 0,0,8 ld 7,24(5) std 0,24(4) @@ -898,8 +1087,13 @@ L(du4_loop): .align 4 L(du4_fini): /* calculate and store the final DW */ +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 32 + sldi 8,7, 64-32 +#else sldi 0,6, 32 srdi 8,7, 64-32 +#endif or 0,0,8 std 0,0(4) b L(du_done) @@ -909,13 +1103,23 @@ L(du5_do): bf 30,L(du5_1dw) /* there are at least two DWs to copy */ +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 40 + sldi 8,7, 64-40 +#else sldi 0,6, 40 srdi 8,7, 64-40 +#endif or 0,0,8 ld 6,16(5) std 0,0(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,7, 40 + sldi 8,6, 64-40 +#else sldi 0,7, 40 srdi 8,6, 64-40 +#endif or 0,0,8 ld 7,24(5) std 0,8(4) @@ -924,8 +1128,13 @@ L(du5_do): blt cr6,L(du5_fini) /* if total DWs = 3, then bypass loop */ bf 31,L(du5_loop) /* there is a third DW to copy */ +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 40 + sldi 8,7, 64-40 +#else sldi 0,6, 40 srdi 8,7, 64-40 +#endif or 0,0,8 std 0,0(4) mr 6,7 @@ -936,8 +1145,13 @@ L(du5_do): b L(du5_loop) .align 4 L(du5_1dw): +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 40 + sldi 8,7, 64-40 +#else sldi 0,6, 40 srdi 8,7, 64-40 +#endif addi 5,5,16 or 0,0,8 bf 31,L(du5_loop) @@ -949,23 +1163,43 @@ L(du5_1dw): .align 4 /* copy 32 bytes at a time */ L(du5_loop): +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 40 + sldi 8,7, 64-40 +#else sldi 0,6, 40 srdi 8,7, 64-40 +#endif or 0,0,8 ld 6,0(5) std 0,0(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,7, 40 + sldi 8,6, 64-40 +#else sldi 0,7, 40 srdi 8,6, 64-40 +#endif or 0,0,8 ld 7,8(5) std 0,8(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 40 
+ sldi 8,7, 64-40 +#else sldi 0,6, 40 srdi 8,7, 64-40 +#endif or 0,0,8 ld 6,16(5) std 0,16(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,7, 40 + sldi 8,6, 64-40 +#else sldi 0,7, 40 srdi 8,6, 64-40 +#endif or 0,0,8 ld 7,24(5) std 0,24(4) @@ -975,8 +1209,13 @@ L(du5_loop): .align 4 L(du5_fini): /* calculate and store the final DW */ +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 40 + sldi 8,7, 64-40 +#else sldi 0,6, 40 srdi 8,7, 64-40 +#endif or 0,0,8 std 0,0(4) b L(du_done) @@ -986,13 +1225,23 @@ L(du6_do): bf 30,L(du6_1dw) /* there are at least two DWs to copy */ +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 48 + sldi 8,7, 64-48 +#else sldi 0,6, 48 srdi 8,7, 64-48 +#endif or 0,0,8 ld 6,16(5) std 0,0(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,7, 48 + sldi 8,6, 64-48 +#else sldi 0,7, 48 srdi 8,6, 64-48 +#endif or 0,0,8 ld 7,24(5) std 0,8(4) @@ -1001,8 +1250,13 @@ L(du6_do): blt cr6,L(du6_fini) /* if total DWs = 3, then bypass loop */ bf 31,L(du6_loop) /* there is a third DW to copy */ +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 48 + sldi 8,7, 64-48 +#else sldi 0,6, 48 srdi 8,7, 64-48 +#endif or 0,0,8 std 0,0(4) mr 6,7 @@ -1013,8 +1267,13 @@ L(du6_do): b L(du6_loop) .align 4 L(du6_1dw): +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 48 + sldi 8,7, 64-48 +#else sldi 0,6, 48 srdi 8,7, 64-48 +#endif addi 5,5,16 or 0,0,8 bf 31,L(du6_loop) @@ -1026,23 +1285,43 @@ L(du6_1dw): .align 4 /* copy 32 bytes at a time */ L(du6_loop): +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 48 + sldi 8,7, 64-48 +#else sldi 0,6, 48 srdi 8,7, 64-48 +#endif or 0,0,8 ld 6,0(5) std 0,0(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,7, 48 + sldi 8,6, 64-48 +#else sldi 0,7, 48 srdi 8,6, 64-48 +#endif or 0,0,8 ld 7,8(5) std 0,8(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 48 + sldi 8,7, 64-48 +#else sldi 0,6, 48 srdi 8,7, 64-48 +#endif or 0,0,8 ld 6,16(5) std 0,16(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,7, 48 + sldi 8,6, 64-48 +#else sldi 0,7, 48 srdi 8,6, 64-48 +#endif or 0,0,8 ld 7,24(5) std 0,24(4) @@ -1052,8 +1331,13 @@ L(du6_loop): .align 4 L(du6_fini): /* 
calculate and store the final DW */ +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 48 + sldi 8,7, 64-48 +#else sldi 0,6, 48 srdi 8,7, 64-48 +#endif or 0,0,8 std 0,0(4) b L(du_done) @@ -1063,13 +1347,23 @@ L(du7_do): bf 30,L(du7_1dw) /* there are at least two DWs to copy */ +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 56 + sldi 8,7, 64-56 +#else sldi 0,6, 56 srdi 8,7, 64-56 +#endif or 0,0,8 ld 6,16(5) std 0,0(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,7, 56 + sldi 8,6, 64-56 +#else sldi 0,7, 56 srdi 8,6, 64-56 +#endif or 0,0,8 ld 7,24(5) std 0,8(4) @@ -1078,8 +1372,13 @@ L(du7_do): blt cr6,L(du7_fini) /* if total DWs = 3, then bypass loop */ bf 31,L(du7_loop) /* there is a third DW to copy */ +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 56 + sldi 8,7, 64-56 +#else sldi 0,6, 56 srdi 8,7, 64-56 +#endif or 0,0,8 std 0,0(4) mr 6,7 @@ -1090,8 +1389,13 @@ L(du7_do): b L(du7_loop) .align 4 L(du7_1dw): +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 56 + sldi 8,7, 64-56 +#else sldi 0,6, 56 srdi 8,7, 64-56 +#endif addi 5,5,16 or 0,0,8 bf 31,L(du7_loop) @@ -1103,23 +1407,43 @@ L(du7_1dw): .align 4 /* copy 32 bytes at a time */ L(du7_loop): +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 56 + sldi 8,7, 64-56 +#else sldi 0,6, 56 srdi 8,7, 64-56 +#endif or 0,0,8 ld 6,0(5) std 0,0(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,7, 56 + sldi 8,6, 64-56 +#else sldi 0,7, 56 srdi 8,6, 64-56 +#endif or 0,0,8 ld 7,8(5) std 0,8(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 56 + sldi 8,7, 64-56 +#else sldi 0,6, 56 srdi 8,7, 64-56 +#endif or 0,0,8 ld 6,16(5) std 0,16(4) +#ifdef __LITTLE_ENDIAN__ + srdi 0,7, 56 + sldi 8,6, 64-56 +#else sldi 0,7, 56 srdi 8,6, 64-56 +#endif or 0,0,8 ld 7,24(5) std 0,24(4) @@ -1129,8 +1453,13 @@ L(du7_loop): .align 4 L(du7_fini): /* calculate and store the final DW */ +#ifdef __LITTLE_ENDIAN__ + srdi 0,6, 56 + sldi 8,7, 64-56 +#else sldi 0,6, 56 srdi 8,7, 64-56 +#endif or 0,0,8 std 0,0(4) b L(du_done) diff --git a/sysdeps/powerpc/powerpc64/power7/memcpy.S b/sysdeps/powerpc/powerpc64/power7/memcpy.S index 
800a9f1bb1..e8df75f593 100644 --- a/sysdeps/powerpc/powerpc64/power7/memcpy.S +++ b/sysdeps/powerpc/powerpc64/power7/memcpy.S @@ -23,418 +23,361 @@ /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]); Returns 'dst'. */ +#define dst 11 /* Use r11 so r3 kept unchanged. */ +#define src 4 +#define cnt 5 + .machine power7 EALIGN (memcpy, 5, 0) CALL_MCOUNT 3 - cmpldi cr1,5,31 + cmpldi cr1,cnt,31 neg 0,3 - std 3,-16(1) - std 31,-8(1) - cfi_offset(31,-8) ble cr1, L(copy_LT_32) /* If move < 32 bytes use short move code. */ - andi. 11,3,7 /* Check alignment of DST. */ - - - clrldi 10,4,61 /* Check alignment of SRC. */ - cmpld cr6,10,11 /* SRC and DST alignments match? */ - mr 12,4 - mr 31,5 +#ifdef __LITTLE_ENDIAN__ +/* In little-endian mode, power7 takes an alignment trap on any lxvd2x + or stxvd2x crossing a 32-byte boundary, so ensure the aligned_copy + loop is only used for quadword aligned copies. */ + andi. 10,3,15 + clrldi 11,4,60 +#else + andi. 10,3,7 /* Check alignment of DST. */ + clrldi 11,4,61 /* Check alignment of SRC. */ +#endif + cmpld cr6,10,11 /* SRC and DST alignments match? */ + + mr dst,3 bne cr6,L(copy_GE_32_unaligned) + beq L(aligned_copy) - srdi 9,5,3 /* Number of full quadwords remaining. */ - - beq L(copy_GE_32_aligned_cont) - - clrldi 0,0,61 - mtcrf 0x01,0 - subf 31,0,5 - - /* Get the SRC aligned to 8 bytes. */ - -1: bf 31,2f - lbz 6,0(12) - addi 12,12,1 - stb 6,0(3) - addi 3,3,1 -2: bf 30,4f - lhz 6,0(12) - addi 12,12,2 - sth 6,0(3) - addi 3,3,2 -4: bf 29,0f - lwz 6,0(12) - addi 12,12,4 - stw 6,0(3) - addi 3,3,4 -0: - clrldi 10,12,61 /* Check alignment of SRC again. */ - srdi 9,31,3 /* Number of full doublewords remaining. */ - -L(copy_GE_32_aligned_cont): - - clrldi 11,31,61 - mtcrf 0x01,9 - - srdi 8,31,5 - cmpldi cr1,9,4 - cmpldi cr6,11,0 - mr 11,12 - - /* Copy 1~3 doublewords so the main loop starts - at a multiple of 32 bytes. 
*/ + mtocrf 0x01,0 +#ifdef __LITTLE_ENDIAN__ + clrldi 0,0,60 +#else + clrldi 0,0,61 +#endif - bf 30,1f - ld 6,0(12) - ld 7,8(12) - addi 11,12,16 - mtctr 8 - std 6,0(3) - std 7,8(3) - addi 10,3,16 - bf 31,4f - ld 0,16(12) - std 0,16(3) - blt cr1,3f - addi 11,12,24 - addi 10,3,24 - b 4f - - .align 4 -1: /* Copy 1 doubleword and set the counter. */ - mr 10,3 - mtctr 8 - bf 31,4f - ld 6,0(12) - addi 11,12,8 - std 6,0(3) - addi 10,3,8 - -L(aligned_copy): - /* Main aligned copy loop. Copies up to 128-bytes at a time. */ - .align 4 +/* Get the DST and SRC aligned to 8 bytes (16 for little-endian). */ +1: + bf 31,2f + lbz 6,0(src) + addi src,src,1 + stb 6,0(dst) + addi dst,dst,1 +2: + bf 30,4f + lhz 6,0(src) + addi src,src,2 + sth 6,0(dst) + addi dst,dst,2 4: - /* check for any 32-byte or 64-byte lumps that are outside of a - nice 128-byte range. R8 contains the number of 32-byte - lumps, so drop this into the CR, and use the SO/EQ bits to help - handle the 32- or 64- byte lumps. Then handle the rest with an - unrolled 128-bytes-at-a-time copy loop. */ - mtocrf 1,8 - li 6,16 # 16() index - li 7,32 # 32() index - li 8,48 # 48() index - -L(aligned_32byte): - /* if the SO bit (indicating a 32-byte lump) is not set, move along. */ - bns cr7,L(aligned_64byte) - lxvd2x 6,0,11 - lxvd2x 7,11,6 - addi 11,11,32 - stxvd2x 6,0,10 - stxvd2x 7,10,6 - addi 10,10,32 - -L(aligned_64byte): - /* if the EQ bit (indicating a 64-byte lump) is not set, move along. */ - bne cr7,L(aligned_128setup) - lxvd2x 6,0,11 - lxvd2x 7,11,6 - lxvd2x 8,11,7 - lxvd2x 9,11,8 - addi 11,11,64 - stxvd2x 6,0,10 - stxvd2x 7,10,6 - stxvd2x 8,10,7 - stxvd2x 9,10,8 - addi 10,10,64 - -L(aligned_128setup): - /* Set up for the 128-byte at a time copy loop. */ - srdi 8,31,7 - cmpdi 8,0 # Any 4x lumps left? - beq 3f # if not, move along. - lxvd2x 6,0,11 - lxvd2x 7,11,6 - mtctr 8 # otherwise, load the ctr and begin. 
- li 8,48 # 48() index + bf 29,8f + lwz 6,0(src) + addi src,src,4 + stw 6,0(dst) + addi dst,dst,4 +8: +#ifdef __LITTLE_ENDIAN__ + bf 28,16f + ld 6,0(src) + addi src,src,8 + std 6,0(dst) + addi dst,dst,8 +16: +#endif + subf cnt,0,cnt + +/* Main aligned copy loop. Copies 128 bytes at a time. */ +L(aligned_copy): + li 6,16 + li 7,32 + li 8,48 + mtocrf 0x02,cnt + srdi 12,cnt,7 + cmpdi 12,0 + beq L(aligned_tail) + lxvd2x 6,0,src + lxvd2x 7,src,6 + mtctr 12 b L(aligned_128loop) + .align 4 L(aligned_128head): /* for the 2nd + iteration of this loop. */ - lxvd2x 6,0,11 - lxvd2x 7,11,6 + lxvd2x 6,0,src + lxvd2x 7,src,6 L(aligned_128loop): - lxvd2x 8,11,7 - lxvd2x 9,11,8 - stxvd2x 6,0,10 - addi 11,11,64 - stxvd2x 7,10,6 - stxvd2x 8,10,7 - stxvd2x 9,10,8 - lxvd2x 6,0,11 - lxvd2x 7,11,6 - addi 10,10,64 - lxvd2x 8,11,7 - lxvd2x 9,11,8 - addi 11,11,64 - stxvd2x 6,0,10 - stxvd2x 7,10,6 - stxvd2x 8,10,7 - stxvd2x 9,10,8 - addi 10,10,64 + lxvd2x 8,src,7 + lxvd2x 9,src,8 + stxvd2x 6,0,dst + addi src,src,64 + stxvd2x 7,dst,6 + stxvd2x 8,dst,7 + stxvd2x 9,dst,8 + lxvd2x 6,0,src + lxvd2x 7,src,6 + addi dst,dst,64 + lxvd2x 8,src,7 + lxvd2x 9,src,8 + addi src,src,64 + stxvd2x 6,0,dst + stxvd2x 7,dst,6 + stxvd2x 8,dst,7 + stxvd2x 9,dst,8 + addi dst,dst,64 bdnz L(aligned_128head) -3: - /* Check for tail bytes. */ - rldicr 0,31,0,60 - mtcrf 0x01,31 - beq cr6,0f - -.L9: - add 3,3,0 - add 12,12,0 - - /* At this point we have a tail of 0-7 bytes and we know that the - destination is doubleword-aligned. */ -4: /* Copy 4 bytes. */ - bf 29,2f - - lwz 6,0(12) - addi 12,12,4 - stw 6,0(3) - addi 3,3,4 -2: /* Copy 2 bytes. */ - bf 30,1f - - lhz 6,0(12) - addi 12,12,2 - sth 6,0(3) - addi 3,3,2 -1: /* Copy 1 byte. */ - bf 31,0f - - lbz 6,0(12) - stb 6,0(3) -0: /* Return original DST pointer. 
*/ - ld 31,-8(1) - ld 3,-16(1) +L(aligned_tail): + mtocrf 0x01,cnt + bf 25,32f + lxvd2x 6,0,src + lxvd2x 7,src,6 + lxvd2x 8,src,7 + lxvd2x 9,src,8 + addi src,src,64 + stxvd2x 6,0,dst + stxvd2x 7,dst,6 + stxvd2x 8,dst,7 + stxvd2x 9,dst,8 + addi dst,dst,64 +32: + bf 26,16f + lxvd2x 6,0,src + lxvd2x 7,src,6 + addi src,src,32 + stxvd2x 6,0,dst + stxvd2x 7,dst,6 + addi dst,dst,32 +16: + bf 27,8f + lxvd2x 6,0,src + addi src,src,16 + stxvd2x 6,0,dst + addi dst,dst,16 +8: + bf 28,4f + ld 6,0(src) + addi src,src,8 + std 6,0(dst) + addi dst,dst,8 +4: /* Copies 4~7 bytes. */ + bf 29,L(tail2) + lwz 6,0(src) + stw 6,0(dst) + bf 30,L(tail5) + lhz 7,4(src) + sth 7,4(dst) + bflr 31 + lbz 8,6(src) + stb 8,6(dst) + /* Return original DST pointer. */ blr - /* Handle copies of 0~31 bytes. */ - .align 4 + +/* Handle copies of 0~31 bytes. */ + .align 4 L(copy_LT_32): - cmpldi cr6,5,8 - mr 12,4 - mtcrf 0x01,5 + mr dst,3 + cmpldi cr6,cnt,8 + mtocrf 0x01,cnt ble cr6,L(copy_LE_8) /* At least 9 bytes to go. */ neg 8,4 - clrrdi 11,4,2 - andi. 0,8,3 - cmpldi cr1,5,16 - mr 10,5 + andi. 0,8,3 + cmpldi cr1,cnt,16 beq L(copy_LT_32_aligned) - /* Force 4-bytes alignment for SRC. */ - mtocrf 0x01,0 - subf 10,0,5 -2: bf 30,1f - - lhz 6,0(12) - addi 12,12,2 - sth 6,0(3) - addi 3,3,2 -1: bf 31,L(end_4bytes_alignment) - - lbz 6,0(12) - addi 12,12,1 - stb 6,0(3) - addi 3,3,1 - - .align 4 + /* Force 4-byte alignment for SRC. */ + mtocrf 0x01,0 + subf cnt,0,cnt +2: + bf 30,1f + lhz 6,0(src) + addi src,src,2 + sth 6,0(dst) + addi dst,dst,2 +1: + bf 31,L(end_4bytes_alignment) + lbz 6,0(src) + addi src,src,1 + stb 6,0(dst) + addi dst,dst,1 + + .align 4 L(end_4bytes_alignment): - cmpldi cr1,10,16 - mtcrf 0x01,10 + cmpldi cr1,cnt,16 + mtocrf 0x01,cnt L(copy_LT_32_aligned): /* At least 6 bytes to go, and SRC is word-aligned. */ blt cr1,8f /* Copy 16 bytes. 
*/ - lwz 6,0(12) - lwz 7,4(12) - stw 6,0(3) - lwz 8,8(12) - stw 7,4(3) - lwz 6,12(12) - addi 12,12,16 - stw 8,8(3) - stw 6,12(3) - addi 3,3,16 + lwz 6,0(src) + lwz 7,4(src) + stw 6,0(dst) + lwz 8,8(src) + stw 7,4(dst) + lwz 6,12(src) + addi src,src,16 + stw 8,8(dst) + stw 6,12(dst) + addi dst,dst,16 8: /* Copy 8 bytes. */ - bf 28,4f + bf 28,L(tail4) + lwz 6,0(src) + lwz 7,4(src) + addi src,src,8 + stw 6,0(dst) + stw 7,4(dst) + addi dst,dst,8 + + .align 4 +/* Copies 4~7 bytes. */ +L(tail4): + bf 29,L(tail2) + lwz 6,0(src) + stw 6,0(dst) + bf 30,L(tail5) + lhz 7,4(src) + sth 7,4(dst) + bflr 31 + lbz 8,6(src) + stb 8,6(dst) + /* Return original DST pointer. */ + blr - lwz 6,0(12) - lwz 7,4(12) - addi 12,12,8 - stw 6,0(3) - stw 7,4(3) - addi 3,3,8 -4: /* Copy 4 bytes. */ - bf 29,2f - - lwz 6,0(12) - addi 12,12,4 - stw 6,0(3) - addi 3,3,4 -2: /* Copy 2-3 bytes. */ + .align 4 +/* Copies 2~3 bytes. */ +L(tail2): bf 30,1f - - lhz 6,0(12) - sth 6,0(3) - bf 31,0f - lbz 7,2(12) - stb 7,2(3) - ld 3,-16(1) + lhz 6,0(src) + sth 6,0(dst) + bflr 31 + lbz 7,2(src) + stb 7,2(dst) blr - .align 4 -1: /* Copy 1 byte. */ - bf 31,0f + .align 4 +L(tail5): + bflr 31 + lbz 6,4(src) + stb 6,4(dst) + blr - lbz 6,0(12) - stb 6,0(3) -0: /* Return original DST pointer. */ - ld 3,-16(1) + .align 4 +1: + bflr 31 + lbz 6,0(src) + stb 6,0(dst) + /* Return original DST pointer. */ blr - /* Handles copies of 0~8 bytes. */ - .align 4 + +/* Handles copies of 0~8 bytes. */ + .align 4 L(copy_LE_8): - bne cr6,4f + bne cr6,L(tail4) /* Though we could've used ld/std here, they are still slow for unaligned cases. */ - lwz 6,0(4) - lwz 7,4(4) - stw 6,0(3) - stw 7,4(3) - ld 3,-16(1) /* Return original DST pointers. */ + lwz 6,0(src) + lwz 7,4(src) + stw 6,0(dst) + stw 7,4(dst) blr - .align 4 -4: /* Copies 4~7 bytes. */ - bf 29,2b - - lwz 6,0(4) - stw 6,0(3) - bf 30,5f - lhz 7,4(4) - sth 7,4(3) - bf 31,0f - lbz 8,6(4) - stb 8,6(3) - ld 3,-16(1) - blr - - .align 4 -5: /* Copy 1 byte. 
*/ - bf 31,0f - - lbz 6,4(4) - stb 6,4(3) - -0: /* Return original DST pointer. */ - ld 3,-16(1) - blr - /* Handle copies of 32+ bytes where DST is aligned (to quadword) but - SRC is not. Use aligned quadword loads from SRC, shifted to realign - the data, allowing for aligned DST stores. */ - .align 4 +/* Handle copies of 32+ bytes where DST is aligned (to quadword) but + SRC is not. Use aligned quadword loads from SRC, shifted to realign + the data, allowing for aligned DST stores. */ + .align 4 L(copy_GE_32_unaligned): - clrldi 0,0,60 /* Number of bytes until the 1st - quadword. */ - andi. 11,3,15 /* Check alignment of DST (against - quadwords). */ - srdi 9,5,4 /* Number of full quadwords remaining. */ + clrldi 0,0,60 /* Number of bytes until the 1st dst quadword. */ +#ifndef __LITTLE_ENDIAN__ + andi. 10,3,15 /* Check alignment of DST (against quadwords). */ +#endif + srdi 9,cnt,4 /* Number of full quadwords remaining. */ beq L(copy_GE_32_unaligned_cont) - /* SRC is not quadword aligned, get it aligned. */ + /* DST is not quadword aligned, get it aligned. */ - mtcrf 0x01,0 - subf 31,0,5 + mtocrf 0x01,0 + subf cnt,0,cnt /* Vector instructions work best when proper alignment (16-bytes) is present. Move 0~15 bytes as needed to get DST quadword-aligned. */ -1: /* Copy 1 byte. */ +1: bf 31,2f - - lbz 6,0(12) - addi 12,12,1 - stb 6,0(3) - addi 3,3,1 -2: /* Copy 2 bytes. */ + lbz 6,0(src) + addi src,src,1 + stb 6,0(dst) + addi dst,dst,1 +2: bf 30,4f - - lhz 6,0(12) - addi 12,12,2 - sth 6,0(3) - addi 3,3,2 -4: /* Copy 4 bytes. */ + lhz 6,0(src) + addi src,src,2 + sth 6,0(dst) + addi dst,dst,2 +4: bf 29,8f - - lwz 6,0(12) - addi 12,12,4 - stw 6,0(3) - addi 3,3,4 -8: /* Copy 8 bytes. */ + lwz 6,0(src) + addi src,src,4 + stw 6,0(dst) + addi dst,dst,4 +8: bf 28,0f - - ld 6,0(12) - addi 12,12,8 - std 6,0(3) - addi 3,3,8 + ld 6,0(src) + addi src,src,8 + std 6,0(dst) + addi dst,dst,8 0: - clrldi 10,12,60 /* Check alignment of SRC. 
*/ - srdi 9,31,4 /* Number of full quadwords remaining. */ + srdi 9,cnt,4 /* Number of full quadwords remaining. */ /* The proper alignment is present, it is OK to copy the bytes now. */ L(copy_GE_32_unaligned_cont): /* Setup two indexes to speed up the indexed vector operations. */ - clrldi 11,31,60 - li 6,16 /* Index for 16-bytes offsets. */ + clrldi 10,cnt,60 + li 6,16 /* Index for 16-bytes offsets. */ li 7,32 /* Index for 32-bytes offsets. */ - cmpldi cr1,11,0 - srdi 8,31,5 /* Setup the loop counter. */ - mr 10,3 - mr 11,12 - mtcrf 0x01,9 - cmpldi cr6,9,1 - lvsl 5,0,12 - lvx 3,0,12 - bf 31,L(setup_unaligned_loop) - - /* Copy another 16 bytes to align to 32-bytes due to the loop . */ - lvx 4,12,6 - vperm 6,3,4,5 - addi 11,12,16 - addi 10,3,16 - stvx 6,0,3 + cmpldi cr1,10,0 + srdi 8,cnt,5 /* Setup the loop counter. */ + mtocrf 0x01,9 + cmpldi cr6,9,1 +#ifdef __LITTLE_ENDIAN__ + lvsr 5,0,src +#else + lvsl 5,0,src +#endif + lvx 3,0,src + li 0,0 + bf 31,L(setup_unaligned_loop) + + /* Copy another 16 bytes to align to 32-bytes due to the loop. */ + lvx 4,src,6 +#ifdef __LITTLE_ENDIAN__ + vperm 6,4,3,5 +#else + vperm 6,3,4,5 +#endif + addi src,src,16 + stvx 6,0,dst + addi dst,dst,16 vor 3,4,4 + clrrdi 0,src,60 L(setup_unaligned_loop): - mtctr 8 - ble cr6,L(end_unaligned_loop) + mtctr 8 + ble cr6,L(end_unaligned_loop) /* Copy 32 bytes at a time using vector instructions. */ - .align 4 + .align 4 L(unaligned_loop): /* Note: vr6/vr10 may contain data that was already copied, @@ -442,62 +385,55 @@ L(unaligned_loop): some portions again. This is faster than having unaligned vector instructions though. */ - lvx 4,11,6 /* vr4 = r11+16. */ - vperm 6,3,4,5 /* Merge the correctly-aligned portions - of vr3/vr4 into vr6. */ - lvx 3,11,7 /* vr3 = r11+32. */ - vperm 10,4,3,5 /* Merge the correctly-aligned portions - of vr3/vr4 into vr10. 
*/ - addi 11,11,32 - stvx 6,0,10 - stvx 10,10,6 - addi 10,10,32 - + lvx 4,src,6 +#ifdef __LITTLE_ENDIAN__ + vperm 6,4,3,5 +#else + vperm 6,3,4,5 +#endif + lvx 3,src,7 +#ifdef __LITTLE_ENDIAN__ + vperm 10,3,4,5 +#else + vperm 10,4,3,5 +#endif + addi src,src,32 + stvx 6,0,dst + stvx 10,dst,6 + addi dst,dst,32 bdnz L(unaligned_loop) - .align 4 + clrrdi 0,src,60 + + .align 4 L(end_unaligned_loop): /* Check for tail bytes. */ - rldicr 0,31,0,59 - mtcrf 0x01,31 - beq cr1,0f + mtocrf 0x01,cnt + beqlr cr1 - add 3,3,0 - add 12,12,0 + add src,src,0 /* We have 1~15 tail bytes to copy, and DST is quadword aligned. */ -8: /* Copy 8 bytes. */ + /* Copy 8 bytes. */ bf 28,4f - - lwz 6,0(12) - lwz 7,4(12) - addi 12,12,8 - stw 6,0(3) - stw 7,4(3) - addi 3,3,8 -4: /* Copy 4 bytes. */ - bf 29,2f - - lwz 6,0(12) - addi 12,12,4 - stw 6,0(3) - addi 3,3,4 -2: /* Copy 2~3 bytes. */ - bf 30,1f - - lhz 6,0(12) - addi 12,12,2 - sth 6,0(3) - addi 3,3,2 -1: /* Copy 1 byte. */ - bf 31,0f - - lbz 6,0(12) - stb 6,0(3) -0: /* Return original DST pointer. */ - ld 31,-8(1) - ld 3,-16(1) + lwz 6,0(src) + lwz 7,4(src) + addi src,src,8 + stw 6,0(dst) + stw 7,4(dst) + addi dst,dst,8 +4: /* Copy 4~7 bytes. */ + bf 29,L(tail2) + lwz 6,0(src) + stw 6,0(dst) + bf 30,L(tail5) + lhz 7,4(src) + sth 7,4(dst) + bflr 31 + lbz 8,6(src) + stb 8,6(dst) + /* Return original DST pointer. */ blr END_GEN_TB (memcpy,TB_TOCLESS) diff --git a/sysdeps/powerpc/powerpc64/power7/mempcpy.S b/sysdeps/powerpc/powerpc64/power7/mempcpy.S index f20be938d2..b93ab7da52 100644 --- a/sysdeps/powerpc/powerpc64/power7/mempcpy.S +++ b/sysdeps/powerpc/powerpc64/power7/mempcpy.S @@ -365,13 +365,21 @@ L(copy_GE_32_unaligned_cont): mr 11,12 mtcrf 0x01,9 cmpldi cr6,9,1 - lvsl 5,0,12 +#ifdef __LITTLE_ENDIAN__ + lvsr 5,0,12 +#else + lvsl 5,0,12 +#endif lvx 3,0,12 bf 31,L(setup_unaligned_loop) /* Copy another 16 bytes to align to 32-bytes due to the loop . 
*/ lvx 4,12,6 - vperm 6,3,4,5 +#ifdef __LITTLE_ENDIAN__ + vperm 6,4,3,5 +#else + vperm 6,3,4,5 +#endif addi 11,12,16 addi 10,3,16 stvx 6,0,3 @@ -391,11 +399,17 @@ L(unaligned_loop): vector instructions though. */ lvx 4,11,6 /* vr4 = r11+16. */ - vperm 6,3,4,5 /* Merge the correctly-aligned portions - of vr3/vr4 into vr6. */ +#ifdef __LITTLE_ENDIAN__ + vperm 6,4,3,5 +#else + vperm 6,3,4,5 +#endif lvx 3,11,7 /* vr3 = r11+32. */ - vperm 10,4,3,5 /* Merge the correctly-aligned portions - of vr3/vr4 into vr10. */ +#ifdef __LITTLE_ENDIAN__ + vperm 10,3,4,5 +#else + vperm 10,4,3,5 +#endif addi 11,11,32 stvx 6,0,10 stvx 10,10,6 -- cgit 1.4.1