about summary refs log tree commit diff
path: root/sysdeps/powerpc/powerpc64/power7/strncat.S
blob: 05502acbbf14771db9998788bbd83c3bb445727c (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
/* Optimized strncat implementation for PowerPC64/POWER7.

   Copyright (C) 2014-2015 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

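/* The algorithm is as follows:

   if n is less than 8,
       perform byte-by-byte catenation
   else if the address of s2 is doubleword aligned ((s2 & 0x7UL) == 0),
       perform aligned doubleword catenation
   else
       perform unaligned (byte-by-byte, unrolled) catenation

   In the aligned case the check for a NUL byte inside each doubleword
   is made using the cmpb instruction.  */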

/* char * [r3] strncat (char *s1 [r3],
                        const char *s2 [r4],
                        size_t size [r5])  */

#include <sysdep.h>

/* Name under which the routine below is emitted.  It is only defined here
   when STRNCAT has not been defined already, so an including file can
   choose a different symbol name.  Any existing `strncat' macro is dropped
   first so that the default expands to the plain identifier.  */
#ifndef STRNCAT
# undef strncat
# define STRNCAT  strncat
#endif

/* Symbol used for the internal call that finds the end of s1.  It can be
   predefined by an including file as well.  */
#ifndef STRLEN
/* For builds with no IFUNC support, local calls should be made to the
   internal GLIBC symbol (created by libc_hidden_builtin_def).  */
# ifdef SHARED
#  define STRLEN   __GI_strlen
# else
#  define STRLEN   strlen
# endif
#endif

/* Number of bytes by which the routine lowers r1 when it creates its stack
   frame.  The routine stores r29, r30 and r31 (3 x 8 bytes) just below the
   caller's r1, i.e. inside this area.
   NOTE(review): the extra 32 is presumably those 24 bytes rounded up for
   stack alignment -- confirm against FRAME_MIN_SIZE in sysdep.h.  */
#define	FRAMESIZE	(FRAME_MIN_SIZE+32)

/* Allow POWER7 instructions (cmpb is used below).  */
	.machine  power7
/* STRNCAT: append at most n bytes of s2 to the end of s1 and NUL-terminate
   the result.

   In:   r3 = s1 (destination), r4 = s2 (source), r5 = n.
   Out:  r3 = s1 (the original destination pointer).

   Register roles after the prologue:
     r29  n; reduced as bytes are copied on the doubleword path and
          masked to n & 3 in L(leftNbytes).
     r30  s1, kept for the return value.
     r31  read cursor in s2.
     r3   after the STRLEN call: s1 + STRLEN (s1), i.e. the address of
          s1's terminating NUL; write cursor on the doubleword path.
     r9   on the byte paths: write cursor minus one (stores use 1(r9)).
     r7   constant 0 on the doubleword path.
   r29-r31 are saved and restored here; r0, r6-r10, cr0, cr7, CTR and LR
   are used as scratch, in addition to whatever STRLEN clobbers.

   Paths:
     n == 0                     -> return immediately.
     1 <= n <= 3                -> L(byte_by_byte).
     4 <= n <= 7                -> L(bytes_unroll), then L(leftNbytes).
     n > 7, s2 8-byte aligned   -> L(dwordAligned).
     n > 7, s2 not aligned      -> L(bytes_unroll), then L(leftNbytes).

   NOTE(review): no CFI annotations are visible in this routine for the
   frame, LR or r29-r31 saves; confirm whether unwind information is
   expected here.  */
EALIGN(STRNCAT, 4, 0)
	CALL_MCOUNT 3

	mflr r0				/* Copy LR to r0; the call to STRLEN
					   below overwrites LR.  */

/* We shall use r29, r30 and r31 non volatile register for retention.
   They are stored just below the caller's r1, in the area that becomes
   part of the frame created by the stdu below.  */
	std r29, -24(r1)		/* Save callers register r29.  */
	std r30, -16(r1)		/* Save callers register r30.  */
	std r31, -8(r1)			/* Save callers register r31.  */

	std r0, 16(r1)			/* Store the link register at
					   16(r1) of the caller's frame.  */
	stdu r1, -FRAMESIZE(r1)		/* Create the stack frame.  */

/* Improve performance with CPU pre-fetch.  */
	dcbt 0, r3			/* Touch the cache block at s1.  */
	dcbt 0, r4			/* Touch the cache block at s2.  */

	mr. r29, r5			/* Save "n" in r29 and set cr0 from
					   it.  */
	mr r30, r3			/* Save "s1" in r30 from r3.  */
	beq cr0,L(done)			/* n == 0: nothing to append, return
					   s1 untouched.  */

	mr r31, r4			/* Save "s2" in r31 from r4.  */
	bl STRLEN			/* r3 still holds s1: r3 = length of
					   s1.  */
	nop				/* Slot after the call.
					   NOTE(review): presumably reserved
					   for the linker (TOC restore) --
					   confirm.  */
	cmpldi cr7, r29, 7		/* Unsigned compare of n with 7.  */
	add r3, r30, r3			/* r3 = s1 + length = address of
					   s1's terminating NUL, where the
					   first appended byte goes.  */
	bgt cr7,L(alignment)		/* n > 7: check the alignment of
					   s2.  */

	cmpldi cr7, r29, 3		/* Unsigned compare of n with 3.  */
	addi r9, r3, -1			/* r9 = write position - 1; the
					   stores below address 1(r9).  */
	bgt cr7, L(bytes_unroll)	/* 4 <= n <= 7: use the unrolled
					   byte loop.  */

/* Copy at most r29 bytes one at a time.
   On entry: r9 = write position - 1, r31 = next source byte,
   r29 = number of bytes still allowed (non-zero on every path that
   reaches this label).  */
L(byte_by_byte):
	lbz r10, 0(r31)			/* First source byte.  */
	addi r8, r9, 1			/* r8 = address this byte is written
					   to.  */
	cmpdi cr7, r10, 0		/* Check for NUL in "s2".  */
	stb r10, 1(r9)			/* Store it, NUL included.  */
	beq cr7, L(done)		/* NUL copied: the result is already
					   terminated.  */
	add r9, r9, r29			/* The next three instructions
					   compute (r9 + r29) - r8 + 1,  */
	subf r9, r8, r9			/* which equals r29 because
					   r8 = r9 + 1.  */
	addi r9, r9, 1
	mtctr r9			/* CTR = r29; the bdnz below
					   decrements it once before the
					   first further copy.  */
	b L(branch2)
	.p2align 4
L(branch1):
	lbzu r10, 1(r31)		/* Advance r31 and load the next
					   source byte.  */
	cmpdi cr7, r10, 0
	stbu r10, 1(r8)			/* Advance r8 and store the byte.  */
	beq cr7,L(done)			/* NUL copied: finished.  */
L(branch2):
	mr r9, r8			/* r9 = address of the last byte
					   written.  */
	bdnz L(branch1)			/* Loop while bytes remain.  */
	beq cr7,L(done)			/* NOTE(review): cr7 still holds the
					   last byte compare, and every path
					   to this point has already left on
					   EQ, so this branch appears never
					   to be taken.  */
/* The byte limit was reached without copying a NUL: terminate right
   after the last byte written (r9).  */
L(nullTerminate):
	li r10, 0			/* Load NUL for termination.  */
	stb r10, 1(r9)			/* Terminate s1 with NUL.  */
	.p2align 4			/* Execution falls through this
					   alignment into L(done).  */
L(done):				/* We return now.   */
	addi r1, r1, FRAMESIZE		/* Restore stack pointer.  */
	mr r3, r30			/* Return value: the original s1
					   pointer.  */
	ld r0, 16(r1)			/* Read the saved link register.  */
	ld r29, -24(r1)			/* Restore save register r29.  */
	ld r30, -16(r1)			/* Restore save register r30.  */
	ld r31, -8(r1)			/* Restore save register r31.  */
	mtlr r0				/* Restore link register.  */
	blr				/* Branch to link register.  */

	.p2align 4
/* n > 7: choose between the doubleword loop and the unrolled byte loop
   according to the alignment of s2.  */
L(alignment):
	rldicl. r9, r31, 0, 61		/* r9 = s2 & 7 (low three bits),
					   cr0 set from the result.  */
	beq cr0,L(dwordAligned)		/* s2 is 8-byte aligned.  */
					/* Otherwise fall through into
					   L(bytes_unroll).  */

	.p2align 4
/* Unaligned bytes in string, so process byte by byte.
   POWER7 has performance gains over loop unroll.
   The loop body (L(L10) followed by L(L44)) copies four bytes per pass
   and leaves through L(done) as soon as a NUL byte has been copied.
   CTR = n / 4 passes; the remaining n & 3 bytes are handled in
   L(leftNbytes).  */
L(bytes_unroll):
	addi r9, r3, -1			/* r9 = write position - 1.  */
	srdi r10, r29, 2		/* r10 = n / 4.  */
	mtctr r10
	b L(L10)			/* Enter the loop at its first
					   byte.  */
	.p2align 4
L(L44):
	lbz r10, 1(r31)			/* Second byte of the group.  */
	cmpdi cr7, r10, 0		/* Compare ; if byte not zero,
					   continue.  */
	stb r10, 2(r9)			/* Store byte  */
	beq cr7, L(done)
	addi r31, r31, 4		/* Advance the source cursor past
					   the whole group; the remaining
					   two loads use negative
					   offsets.  */

	lbz r10, -2(r31)		/* Third byte of the group.  */
	cmpdi cr7, r10, 0
	stb r10, 3(r9)
	beq cr7, L(done)

	lbz r10, -1(r31)		/* Fourth byte of the group.  */
	cmpdi cr7, r10, 0
	stbu r10, 4(r9)			/* Store it and advance r9 by 4, so
					   r9 is again the address of the
					   last byte written.  */
	beq cr7, L(done)

	bdz L(leftNbytes)		/* All n / 4 groups done: handle the
					   n & 3 tail.  */

L(L10):
	lbz r10, 0(r31)			/* First byte of the group.  */
	cmpdi cr7, r10, 0
	stb r10, 1(r9)
	bne cr7,L(L44)			/* Not NUL: continue with the rest
					   of the group.  */
	b L(done)			/* NUL copied: finished.  */
	.p2align 4
/* If s2 is double word aligned, we load and store double word.
   Here r3 is the write cursor, r31 the read cursor and r29 the number
   of bytes that may still be copied.  */
L(dwordAligned):
/* read, write 8 bytes at a time  */
	srdi r8, r29, 3			/* Compute count for CTR to loop;
					   count = n/8.  */
	li r7, 0			/* r7 = 0: NUL for the byte compare
					   and for the final store.  */
	li r10, 0			/* r10 = 0: comparand for cmpb.  */

	mtctr r8			/* Move count to CTR.  */
L(loop8):
	ld r9, 0(r31)			/* Read double word from s2.  */
	cmpb r6, r9, r10		/* r6 gets a non-zero byte wherever
					   the double word has a zero
					   byte.  */
	cmpdi r6, 0			/* cr0 = (r6 compared with 0).  */
	bne+ L(a8)			/* A NUL is inside this double word:
					   finish byte by byte.  The '+' is
					   a static prediction hint.  */
	std r9, 0(r3)			/* Append double word from s2
					   to s1.  */
	addi r3, r3, 8			/* Increment s1.  */
	addi r31, r31, 8		/* Increment s2.  */
	subi r29, r29, 8		/* Decrement count by 8.  */
	bdnz L(loop8)			/* Continue while CTR is
					   non zero.  */

/* Reached either with a NUL somewhere in the next 8 source bytes, or
   with fewer than 8 bytes left to copy (r29 = n & 7).  */
L(a8):
	cmpdi r29, 0			/* If "n" is already zero, we skip. */
	beq+ L(align8align)

	mtctr r29			/* Process left over bytes in "n".  */
L(unaligned0):
	lbz r9, 0(r31)			/* Read a byte from s2.  */
	cmpw r9, r7			/* If byte is NUL, we stop here . */
	beq+ L(align8align)		/* Skip processing further if NUL;
					   the terminator is stored
					   below.  */
	stb  r9, 0(r3)			/* If not NUL, store byte into s1.  */
	addi r3, r3, 1			/* Increment s1 by 1.  */
	addi r31, r31, 1		/* Increment s2 by 1.  */
	bdnz L(unaligned0)		/* Decrement CTR and loop
					   until it is zero.  */
L(align8align):
	stb r7, 0(r3)			/* Terminate s1 with NUL.  */

/* Same epilogue as at L(done).  */
	addi r1, r1, FRAMESIZE		/* Restore stack pointer.  */
	mr r3, r30			/* Return value: the original s1
					   pointer.  */
	ld r0, 16(r1)			/* Read the saved link register.  */
	ld r29, -24(r1)			/* Restore save register r29.  */
	ld r30, -16(r1)			/* Restore save register r30.  */
	ld r31, -8(r1)			/* Restore save register r31.  */
	mtlr r0				/* Restore link register.  */
	blr				/* Branch to link register  */

	.p2align 4
/* Tail of the unrolled byte loop: n / 4 groups have been copied without
   meeting a NUL; r9 = address of the last byte written, r31 = next
   source byte.  */
L(leftNbytes):
	rldicl. r29, r29, 0, 62		/* r29 = n & 3 (low two bits), cr0
					   set from the result.  */
	bne cr0,L(byte_by_byte)		/* 1..3 bytes left: copy them one by
					   one. */
	b L(nullTerminate)		/* None left: finish catenation with
					   NUL termination.  */
END(STRNCAT)