From 679e4c434f755644cc2093c9940ac58d0c2b51cf Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 11 Oct 2002 07:22:18 +0000 Subject: * locale/newlocale.c (__newlocale): If setting all categories to "C", just return &_nl_C_locobj instead of copying it. * locale/freelocale.c (__freelocale): Check for &_nl_C_locobj. * locale/duplocale.c (__duplocale): Likewise. 2002-10-07 Roland McGrath * config.h.in (HAVE_I386_SET_GDT): New #undef. * sysdeps/mach/configure.in: Define it with new check for i386_set_gdt. * sysdeps/mach/configure: Regenerated. 2002-10-06 Franz Sirl * sysdeps/unix/sysv/linux/powerpc/powerpc32/sysdep.h (INLINE_SYSCALL): Add all necessary register outputs for syscall-clobbered registers. 2002-10-02 David Mosberger * sysdeps/ia64/bzero.S: Rewritten by Sverre Jarp to tune for Itanium 2 (and Itanium). Fix unwind directives and make it fit in 80 columns. * sysdeps/ia64/memset.S: Ditto. * sysdeps/ia64/memcpy.S: Ditto. Move jump table to .rodata section. 2002-10-03 Roland McGrath * sysdeps/mach/hurd/i386/init-first.c (_hurd_stack_setup): Add clobbers to asm. --- sysdeps/ia64/memcpy.S | 470 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 345 insertions(+), 125 deletions(-) (limited to 'sysdeps/ia64/memcpy.S') diff --git a/sysdeps/ia64/memcpy.S b/sysdeps/ia64/memcpy.S index b49f99e286..257e1aa341 100644 --- a/sysdeps/ia64/memcpy.S +++ b/sysdeps/ia64/memcpy.S @@ -1,7 +1,8 @@ /* Optimized version of the standard memcpy() function. This file is part of the GNU C Library. Copyright (C) 2000, 2001 Free Software Foundation, Inc. - Contributed by Dan Pop . + Contributed by Dan Pop for Itanium . + Rewritten for McKinley by Sverre Jarp, HP Labs/CERN The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public @@ -26,27 +27,39 @@ in2: byte count An assembly implementation of the algorithm used by the generic C - version from glibc. The case when all three arguments are multiples - of 8 is treated separatedly, for extra performance. + version from glibc. The case when source and sest are aligned is + treated separately, for extra performance. - In this form, it assumes little endian mode. For big endian mode, + In this form, memcpy assumes little endian mode. For big endian mode, sh1 must be computed using an extra instruction: sub sh1 = 64, sh1 and the order of r[MEMLAT] and r[MEMLAT+1] must be reverted in the shrp instruction. */ +#define USE_LFETCH +#define USE_FLP #include #undef ret +#define LFETCH_DIST 500 + +#define ALIGN_UNROLL_no 4 // no. of elements +#define ALIGN_UNROLL_sh 2 // (shift amount) + +#define MEMLAT 8 +#define Nrot ((4*(MEMLAT+2) + 7) & ~7) + #define OP_T_THRES 16 #define OPSIZ 8 -#define adest r15 -#define saved_pr r17 -#define saved_lc r18 +#define loopcnt r14 +#define elemcnt r15 +#define saved_pr r16 +#define saved_lc r17 +#define adest r18 #define dest r19 -#define src r20 -#define len r21 -#define asrc r22 +#define asrc r20 +#define src r21 +#define len r22 #define tmp2 r23 #define tmp3 r24 #define tmp4 r25 @@ -54,113 +67,339 @@ #define ploop56 r27 #define loopaddr r28 #define sh1 r29 -#define loopcnt r30 -#define value r31 - -#define LOOP(shift) \ - .align 32 ; \ -.loop##shift##: \ -(p[0]) ld8 r[0] = [asrc], 8 ; /* w1 */ \ -(p[MEMLAT+1]) st8 [dest] = value, 8 ; \ -(p[MEMLAT]) shrp value = r[MEMLAT], r[MEMLAT+1], shift ; \ - nop.b 0 ; \ - nop.b 0 ; \ - br.ctop.sptk .loop##shift ; \ - br.cond.sptk .cpyfew ; /* deal with the remaining bytes */ - -#define MEMLAT 21 -#define Nrot (((2*MEMLAT+3) + 7) & ~7) +#define ptr1 r30 +#define ptr2 r31 + +#define movi0 mov + +#define p_scr p6 +#define p_xtr p7 +#define p_nxtr p8 +#define p_few p9 + +#if defined(USE_FLP) +#define load ldf8 +#define store stf8 +#define tempreg f6 +#define the_r fr +#define the_s fs +#define the_t ft +#define the_q fq +#define the_w fw +#define the_x fx +#define the_y fy +#define the_z fz +#elif defined(USE_INT) +#define load ld8 +#define store st8 +#define tempreg tmp2 +#define the_r r +#define the_s s +#define the_t t +#define the_q q +#define the_w w +#define the_x x +#define the_y y +#define the_z z +#endif + + +#if defined(USE_LFETCH) +#define LOOP(shift) \ + .align 32 ; \ +.loop##shift##: \ +{ .mmb \ +(p[0]) ld8.nt1 r[0] = [asrc], 8 ; \ +(p[0]) lfetch.nt1 [ptr1], 16 ; \ + nop.b 0 ; \ +} { .mib \ +(p[MEMLAT+1]) st8 [dest] = tmp3, 8 ; \ +(p[MEMLAT]) shrp tmp3 = r[MEMLAT], s[MEMLAT+1], shift ; \ + nop.b 0 ;; \ + } { .mmb \ +(p[0]) ld8.nt1 s[0] = [asrc], 8 ; \ +(p[0]) lfetch.nt1 [ptr2], 16 ; \ + nop.b 0 ; \ +} { .mib \ +(p[MEMLAT+1]) st8 [dest] = tmp4, 8 ; \ +(p[MEMLAT]) shrp tmp4 = s[MEMLAT], r[MEMLAT], shift ; \ + br.ctop.sptk.many .loop##shift \ +;; } \ +{ .mib \ + br.cond.sptk.many .copy_bytes ; /* deal with the remaining bytes */ \ +} +#else +#define LOOP(shift) \ + .align 32 ; \ +.loop##shift##: \ +{ .mmb \ +(p[0]) ld8.nt1 r[0] = [asrc], 8 ; \ + nop.b 0 ; \ +} { .mib \ +(p[MEMLAT+1]) st8 [dest] = tmp3, 8 ; \ +(p[MEMLAT]) shrp tmp3 = r[MEMLAT], s[MEMLAT+1], shift ; \ + nop.b 0 ;; \ + } { .mmb \ +(p[0]) ld8.nt1 s[0] = [asrc], 8 ; \ + nop.b 0 ; \ +} { .mib \ +(p[MEMLAT+1]) st8 [dest] = tmp4, 8 ; \ +(p[MEMLAT]) shrp tmp4 = s[MEMLAT], r[MEMLAT], shift ; \ + br.ctop.sptk.many .loop##shift \ +;; } \ +{ .mib \ + br.cond.sptk.many .copy_bytes ; /* deal with the remaining bytes */ \ +} +#endif + ENTRY(memcpy) +{ .mmi .prologue alloc r2 = ar.pfs, 3, Nrot - 3, 0, Nrot - .rotr r[MEMLAT + 2], q[MEMLAT + 1] - .rotp p[MEMLAT + 2] - mov ret0 = in0 // return value = dest - .save pr, saved_pr - mov saved_pr = pr // save the predicate registers - .save ar.lc, saved_lc - mov saved_lc = ar.lc // save the loop counter - .body - or tmp3 = in0, in1 ;; // tmp3 = dest | src - or tmp3 = tmp3, in2 // tmp3 = dest | src | len + .rotr r[MEMLAT+1], s[MEMLAT+2], q[MEMLAT+1], t[MEMLAT+1] + .rotp p[MEMLAT+2] + .rotf fr[MEMLAT+1], fq[MEMLAT+1], fs[MEMLAT+1], ft[MEMLAT+1] + mov ret0 = in0 // return tmp2 = dest + .save pr, saved_pr + movi0 saved_pr = pr // save the predicate registers +} { .mmi + and tmp4 = 7, in0 // check if destination is aligned mov dest = in0 // dest mov src = in1 // src +;; } +{ .mii + cmp.eq p_scr, p0 = in2, r0 // if (len == 0) + .save ar.lc, saved_lc + movi0 saved_lc = ar.lc // save the loop counter + .body + cmp.ge p_few, p0 = OP_T_THRES, in2 // is len <= OP_T_THRESH +} { .mbb mov len = in2 // len - sub tmp2 = r0, in0 // tmp2 = -dest - cmp.eq p6, p0 = in2, r0 // if (len == 0) -(p6) br.cond.spnt .restore_and_exit;;// return dest; - and tmp4 = 7, tmp3 // tmp4 = (dest | src | len) & 7 - shr.u loopcnt = len, 4 ;; // loopcnt = len / 16 - cmp.ne p6, p0 = tmp4, r0 // if ((dest | src | len) & 7 != 0) -(p6) br.cond.sptk .next // goto next; - -// The optimal case, when dest, src and len are all multiples of 8 - - and tmp3 = 0xf, len // tmp3 = len % 16 - mov pr.rot = 1 << 16 // set rotating predicates - mov ar.ec = MEMLAT + 1 ;; // set the epilog counter - cmp.ne p6, p0 = tmp3, r0 // do we have to copy an extra word? - adds loopcnt = -1, loopcnt;; // --loopcnt -(p6) ld8 value = [src], 8;; -(p6) st8 [dest] = value, 8 // copy the "extra" word - mov ar.lc = loopcnt // set the loop counter - cmp.eq p6, p0 = 8, len -(p6) br.cond.spnt .restore_and_exit;;// there was only one word to copy - adds adest = 8, dest - adds asrc = 8, src ;; +(p_scr) br.cond.dpnt.few .restore_and_exit // Branch no. 1: return dest +(p_few) br.cond.dpnt.many .copy_bytes // Branch no. 2: copy byte by byte +;; } +{ .mmi +#if defined(USE_LFETCH) + lfetch.nt1 [dest] // + lfetch.nt1 [src] // +#endif + shr.u elemcnt = len, 3 // elemcnt = len / 8 +} { .mib + cmp.eq p_scr, p0 = tmp4, r0 // is destination aligned? + sub loopcnt = 7, tmp4 // +(p_scr) br.cond.dptk.many .dest_aligned +;; } +{ .mmi + ld1 tmp2 = [src], 1 // + sub len = len, loopcnt, 1 // reduce len + movi0 ar.lc = loopcnt // +} { .mib + cmp.ne p_scr, p0 = 0, loopcnt // avoid loading beyond end-point +;; } + +.l0: // ---------------------------- // L0: Align src on 8-byte boundary +{ .mmi + st1 [dest] = tmp2, 1 // +(p_scr) ld1 tmp2 = [src], 1 // +} { .mib + cmp.lt p_scr, p0 = 1, loopcnt // avoid load beyond end-point + add loopcnt = -1, loopcnt + br.cloop.dptk.few .l0 // +;; } + +.dest_aligned: +{ .mmi + and tmp4 = 7, src // ready for alignment check + shr.u elemcnt = len, 3 // elemcnt = len / 8 +;; } +{ .mib + cmp.ne p_scr, p0 = tmp4, r0 // is source also aligned + tbit.nz p_xtr, p_nxtr = src, 3 // prepare a separate move if src +} { .mib // is not 16B aligned + add ptr2 = LFETCH_DIST, dest // prefetch address + add ptr1 = LFETCH_DIST, src +(p_scr) br.cond.dptk.many .src_not_aligned +;; } + +// The optimal case, when dest, and src are aligned + +.both_aligned: +{ .mmi + .pred.rel "mutex",p_xtr,p_nxtr +(p_xtr) cmp.gt p_scr, p0 = ALIGN_UNROLL_no+1, elemcnt // Need N + 1 to qualify +(p_nxtr) cmp.gt p_scr, p0 = ALIGN_UNROLL_no, elemcnt // Need only N to qualify + movi0 pr.rot = 1 << 16 // set rotating predicates +} { .mib +(p_scr) br.cond.dpnt.many .copy_full_words +;; } + +{ .mmi +(p_xtr) load tempreg = [src], 8 +(p_xtr) add elemcnt = -1, elemcnt + movi0 ar.ec = MEMLAT + 1 // set the epilog counter +;; } +{ .mmi +(p_xtr) add len = -8, len // + add asrc = 16, src // one bank apart (for USE_INT) + shr.u loopcnt = elemcnt, ALIGN_UNROLL_sh // cater for unrolling +;;} +{ .mmi + add loopcnt = -1, loopcnt +(p_xtr) store [dest] = tempreg, 8 // copy the "extra" word + nop.i 0 +;; } +{ .mib + add adest = 16, dest + movi0 ar.lc = loopcnt // set the loop counter +;; } + .align 32 -.l0: -(p[0]) ld8 r[0] = [src], 16 -(p[0]) ld8 q[0] = [asrc], 16 -(p[MEMLAT]) st8 [dest] = r[MEMLAT], 16 -(p[MEMLAT]) st8 [adest] = q[MEMLAT], 16 - br.ctop.dptk .l0 ;; - - mov pr = saved_pr, -1 // restore the predicate registers - mov ar.lc = saved_lc // restore the loop counter +#if defined(USE_FLP) +.l1: // ------------------------------- // L1: Everything a multiple of 8 +{ .mmi +#if defined(USE_LFETCH) +(p[0]) lfetch.nt1 [ptr2],32 +#endif +(p[0]) ldfp8 the_r[0],the_q[0] = [src], 16 +(p[0]) add len = -32, len +} {.mmb +(p[MEMLAT]) store [dest] = the_r[MEMLAT], 8 +(p[MEMLAT]) store [adest] = the_s[MEMLAT], 8 +;; } +{ .mmi +#if defined(USE_LFETCH) +(p[0]) lfetch.nt1 [ptr1],32 +#endif +(p[0]) ldfp8 the_s[0], the_t[0] = [src], 16 +} {.mmb +(p[MEMLAT]) store [dest] = the_q[MEMLAT], 24 +(p[MEMLAT]) store [adest] = the_t[MEMLAT], 24 + br.ctop.dptk.many .l1 +;; } +#elif defined(USE_INT) +.l1: // ------------------------------- // L1: Everything a multiple of 8 +{ .mmi +(p[0]) load the_r[0] = [src], 8 +(p[0]) load the_q[0] = [asrc], 8 +(p[0]) add len = -32, len +} {.mmb +(p[MEMLAT]) store [dest] = the_r[MEMLAT], 8 +(p[MEMLAT]) store [adest] = the_q[MEMLAT], 8 +;; } +{ .mmi +(p[0]) load the_s[0] = [src], 24 +(p[0]) load the_t[0] = [asrc], 24 +} {.mmb +(p[MEMLAT]) store [dest] = the_s[MEMLAT], 24 +(p[MEMLAT]) store [adest] = the_t[MEMLAT], 24 +#if defined(USE_LFETCH) +;; } +{ .mmb +(p[0]) lfetch.nt1 [ptr2],32 +(p[0]) lfetch.nt1 [ptr1],32 +#endif + br.ctop.dptk.many .l1 +;; } +#endif + +.copy_full_words: +{ .mib + cmp.gt p_scr, p0 = 8, len // + shr.u elemcnt = len, 3 // +(p_scr) br.cond.dpnt.many .copy_bytes +;; } +{ .mii + load tempreg = [src], 8 + add loopcnt = -1, elemcnt // +;; } +{ .mii + cmp.ne p_scr, p0 = 0, loopcnt // + mov ar.lc = loopcnt // +;; } + +.l2: // ------------------------------- // L2: Max 4 words copied separately +{ .mmi + store [dest] = tempreg, 8 +(p_scr) load tempreg = [src], 8 // + add len = -8, len +} { .mib + cmp.lt p_scr, p0 = 1, loopcnt // avoid load beyond end-point + add loopcnt = -1, loopcnt + br.cloop.dptk.few .l2 +;; } + +.copy_bytes: +{ .mib + cmp.eq p_scr, p0 = len, r0 // is len == 0 ? + add loopcnt = -1, len // len--; +(p_scr) br.cond.spnt .restore_and_exit +;; } +{ .mii + ld1 tmp2 = [src], 1 + movi0 ar.lc = loopcnt + cmp.ne p_scr, p0 = 0, loopcnt // avoid load beyond end-point +;; } + +.l3: // ------------------------------- // L3: Final byte move +{ .mmi + st1 [dest] = tmp2, 1 +(p_scr) ld1 tmp2 = [src], 1 +} { .mib + cmp.lt p_scr, p0 = 1, loopcnt // avoid load beyond end-point + add loopcnt = -1, loopcnt + br.cloop.dptk.few .l3 +;; } + +.restore_and_exit: +{ .mmi + movi0 pr = saved_pr, -1 // restore the predicate registers +;; } +{ .mib + movi0 ar.lc = saved_lc // restore the loop counter br.ret.sptk.many b0 -.next: - cmp.ge p6, p0 = OP_T_THRES, len // is len <= OP_T_THRES - and loopcnt = 7, tmp2 // loopcnt = -dest % 8 -(p6) br.cond.spnt .cpyfew // copy byte by byte - ;; - cmp.eq p6, p0 = loopcnt, r0 -(p6) br.cond.sptk .dest_aligned - sub len = len, loopcnt // len -= -dest % 8 - adds loopcnt = -1, loopcnt // --loopcnt - ;; - mov ar.lc = loopcnt -.l1: // copy -dest % 8 bytes - ld1 value = [src], 1 // value = *src++ - ;; - st1 [dest] = value, 1 // *dest++ = value - br.cloop.dptk .l1 ;; -.dest_aligned: +;; } + + +.src_not_aligned: +{ .mmi + cmp.gt p_scr, p0 = 16, len and sh1 = 7, src // sh1 = src % 8 - and tmp2 = -8, len // tmp2 = len & -OPSIZ - and asrc = -8, src // asrc = src & -OPSIZ -- align src - shr.u loopcnt = len, 3 // loopcnt = len / 8 - and len = 7, len;; // len = len % 8 - adds loopcnt = -1, loopcnt // --loopcnt - addl tmp4 = @ltoff(.table), gp - addl tmp3 = @ltoff(.loop56), gp - mov ar.ec = MEMLAT + 1 // set EC - mov pr.rot = 1 << 16;; // set rotating predicates - mov ar.lc = loopcnt // set LC - cmp.eq p6, p0 = sh1, r0 // is the src aligned? -(p6) br.cond.sptk .src_aligned - add src = src, tmp2 // src += len & -OPSIZ + shr.u loopcnt = len, 4 // element-cnt = len / 16 +} { .mib + add tmp4 = @ltoff(.table), gp + add tmp3 = @ltoff(.loop56), gp +(p_scr) br.cond.dpnt.many .copy_bytes // do byte by byte if too few +;; } +{ .mmi + and asrc = -8, src // asrc = (-8) -- align src for loop + add loopcnt = -1, loopcnt // loopcnt-- shl sh1 = sh1, 3 // sh1 = 8 * (src % 8) +} { .mmi + ld8 ptable = [tmp4] // ptable = &table ld8 ploop56 = [tmp3] // ploop56 = &loop56 - ld8 ptable = [tmp4];; // ptable = &table - add tmp3 = ptable, sh1;; // tmp3 = &table + sh1 - mov ar.ec = MEMLAT + 1 + 1 // one more pass needed - ld8 tmp4 = [tmp3];; // tmp4 = loop offset + and tmp2 = -16, len // tmp2 = len & -OPSIZ +;; } +{ .mmi + add tmp3 = ptable, sh1 // tmp3 = &table + sh1 + add src = src, tmp2 // src += len & (-16) + movi0 ar.lc = loopcnt // set LC +;; } +{ .mmi + ld8 tmp4 = [tmp3] // tmp4 = loop offset + sub len = len, tmp2 // len -= len & (-16) + movi0 ar.ec = MEMLAT + 2 // one more pass needed +;; } +{ .mmi + ld8 s[1] = [asrc], 8 // preload sub loopaddr = ploop56,tmp4 // loopadd = &loop56 - loop offset - ld8 r[1] = [asrc], 8;; // w0 - mov b6 = loopaddr;; + movi0 pr.rot = 1 << 16 // set rotating predicates +;; } +{ .mib + nop.m 0 + movi0 b6 = loopaddr br b6 // jump to the appropriate loop +;; } LOOP(8) LOOP(16) @@ -169,26 +408,9 @@ ENTRY(memcpy) LOOP(40) LOOP(48) LOOP(56) - -.src_aligned: -.l3: -(p[0]) ld8 r[0] = [src], 8 -(p[MEMLAT]) st8 [dest] = r[MEMLAT], 8 - br.ctop.dptk .l3 ;; -.cpyfew: - cmp.eq p6, p0 = len, r0 // is len == 0 ? - adds len = -1, len // --len; -(p6) br.cond.spnt .restore_and_exit ;; - mov ar.lc = len -.l4: - ld1 value = [src], 1 - ;; - st1 [dest] = value, 1 - br.cloop.dptk .l4 ;; -.restore_and_exit: - mov pr = saved_pr, -1 // restore the predicate registers - mov ar.lc = saved_lc // restore the loop counter - br.ret.sptk.many b0 +END(memcpy) + + .rodata .align 8 .table: data8 0 // dummy entry @@ -199,5 +421,3 @@ ENTRY(memcpy) data8 .loop56 - .loop40 data8 .loop56 - .loop48 data8 .loop56 - .loop56 - -END(memcpy) -- cgit 1.4.1