Diffstat (limited to 'ports/sysdeps/tile/tilegx/memcpy.c')
-rw-r--r-- | ports/sysdeps/tile/tilegx/memcpy.c | 206
1 file changed, 206 insertions, 0 deletions
diff --git a/ports/sysdeps/tile/tilegx/memcpy.c b/ports/sysdeps/tile/tilegx/memcpy.c
new file mode 100644
index 0000000000..dd6e30dd60
--- /dev/null
+++ b/ports/sysdeps/tile/tilegx/memcpy.c
@@ -0,0 +1,206 @@
+/* Copyright (C) 2011-2012 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Chris Metcalf <cmetcalf@tilera.com>, 2011.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <string.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <arch/chip.h>
+
+/* Must be 8 bytes in size. */
+#define word_t uint64_t
+
+/* How many cache lines ahead should we prefetch? */
+#define PREFETCH_LINES_AHEAD 3
+
+void *
+__memcpy (void *__restrict dstv, const void *__restrict srcv, size_t n)
+{
+  char *__restrict dst1 = (char *) dstv;
+  const char *__restrict src1 = (const char *) srcv;
+  const char *__restrict src1_end;
+  const char *__restrict prefetch;
+  word_t *__restrict dst8;    /* 8-byte pointer to destination memory. */
+  word_t final;               /* Final bytes to write to trailing word, if any */
+  long i;
+
+  if (n < 16)
+    {
+      for (; n; n--)
+        *dst1++ = *src1++;
+      return dstv;
+    }
+
+  /* Locate the end of source memory we will copy.  Don't prefetch
+     past this. */
+  src1_end = src1 + n - 1;
+
+  /* Prefetch ahead a few cache lines, but not past the end. */
+  prefetch = src1;
+  for (i = 0; i < PREFETCH_LINES_AHEAD; i++)
+    {
+      __insn_prefetch (prefetch);
+      prefetch += CHIP_L2_LINE_SIZE ();
+      prefetch = (prefetch < src1_end) ? prefetch : src1;
+    }
+
+  /* Copy bytes until dst is word-aligned. */
+  for (; (uintptr_t) dst1 & (sizeof (word_t) - 1); n--)
+    *dst1++ = *src1++;
+
+  /* 8-byte pointer to destination memory. */
+  dst8 = (word_t *) dst1;
+
+  if (__builtin_expect ((uintptr_t) src1 & (sizeof (word_t) - 1), 0))
+    {
+      /* Misaligned copy.  Copy 8 bytes at a time, but don't bother
+         with other fanciness.
+         TODO: Consider prefetching and using wh64 as well. */
+
+      /* Create an aligned src8. */
+      const word_t *__restrict src8 =
+        (const word_t *) ((uintptr_t) src1 & -sizeof (word_t));
+      word_t b;
+
+      word_t a = *src8++;
+      for (; n >= sizeof (word_t); n -= sizeof (word_t))
+        {
+          b = *src8++;
+          a = __insn_dblalign (a, b, src1);
+          *dst8++ = a;
+          a = b;
+        }
+
+      if (n == 0)
+        return dstv;
+
+      b = ((const char *) src8 <= src1_end) ? *src8 : 0;
+
+      /* Final source bytes to write to trailing partial word, if any. */
+      final = __insn_dblalign (a, b, src1);
+    }
+  else
+    {
+      /* Aligned copy. */
+
+      const word_t *__restrict src8 = (const word_t *) src1;
+
+      /* src8 and dst8 are both word-aligned. */
+      if (n >= CHIP_L2_LINE_SIZE ())
+        {
+          /* Copy until 'dst' is cache-line-aligned. */
+          for (; (uintptr_t) dst8 & (CHIP_L2_LINE_SIZE () - 1);
+               n -= sizeof (word_t))
+            *dst8++ = *src8++;
+
+          /* If copying to self, return.  The test is cheap enough
+             that we do it despite the fact that the memcpy() contract
+             doesn't require us to support overlapping dst and src.
+             This is the most common case of overlap, and any close
+             overlap will cause corruption due to the wh64 below.
+             This case is particularly important since the compiler
+             will emit memcpy() calls for aggregate copies even if it
+             can't prove that src != dst. */
+          if (__builtin_expect (dst8 == src8, 0))
+            return dstv;
+
+          for (; n >= CHIP_L2_LINE_SIZE ();)
+            {
+              __insn_wh64 (dst8);
+
+              /* Prefetch and advance to next line to prefetch, but
+                 don't go past the end. */
+              __insn_prefetch (prefetch);
+              prefetch += CHIP_L2_LINE_SIZE ();
+              prefetch = (prefetch < src1_end) ? prefetch :
+                (const char *) src8;
+
+              /* Copy an entire cache line.  Manually unrolled to
+                 avoid idiosyncracies of compiler unrolling. */
+#define COPY_WORD(offset) ({ dst8[offset] = src8[offset]; n -= 8; })
+              COPY_WORD (0);
+              COPY_WORD (1);
+              COPY_WORD (2);
+              COPY_WORD (3);
+              COPY_WORD (4);
+              COPY_WORD (5);
+              COPY_WORD (6);
+              COPY_WORD (7);
+#if CHIP_L2_LINE_SIZE() != 64
+# error "Fix code that assumes particular L2 cache line size."
+#endif
+
+              dst8 += CHIP_L2_LINE_SIZE () / sizeof (word_t);
+              src8 += CHIP_L2_LINE_SIZE () / sizeof (word_t);
+            }
+        }
+
+      for (; n >= sizeof (word_t); n -= sizeof (word_t))
+        *dst8++ = *src8++;
+
+      if (__builtin_expect (n == 0, 1))
+        return dstv;
+
+      final = *src8;
+    }
+
+  /* n != 0 if we get here.  Write out any trailing bytes. */
+  dst1 = (char *) dst8;
+#ifndef __BIG_ENDIAN__
+  if (n & 4)
+    {
+      *(uint32_t *) dst1 = final;
+      dst1 += 4;
+      final >>= 32;
+      n &= 3;
+    }
+  if (n & 2)
+    {
+      *(uint16_t *) dst1 = final;
+      dst1 += 2;
+      final >>= 16;
+      n &= 1;
+    }
+  if (n)
+    *(uint8_t *) dst1 = final;
+#else
+  if (n & 4)
+    {
+      *(uint32_t *) dst1 = final >> 32;
+      dst1 += 4;
+    }
+  else
+    {
+      final >>= 32;
+    }
+  if (n & 2)
+    {
+      *(uint16_t *) dst1 = final >> 16;
+      dst1 += 2;
+    }
+  else
+    {
+      final >>= 16;
+    }
+  if (n & 1)
+    *(uint8_t *) dst1 = final >> 8;
+#endif
+
+  return dstv;
+}
+weak_alias (__memcpy, memcpy)
+libc_hidden_builtin_def (memcpy)
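
The misaligned branch of the copy relies on the TILE-Gx intrinsic __insn_dblalign: given two consecutive aligned 64-bit words and the original unaligned source pointer, it extracts the eight source bytes that straddle them, so the inner loop issues only aligned loads. The stand-alone sketch below is not part of the commit; it approximates the same merge with portable shifts, assuming a little-endian host, and the name dblalign_sketch and the test values are made up for illustration.

/* Stand-alone sketch, assuming a little-endian host; build with any
   C11 compiler, e.g. cc -std=c11 dblalign_sketch.c.  */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Merge two consecutive aligned words A and B into the unaligned
   64-bit value starting at SRC, roughly what __insn_dblalign does
   in hardware (little-endian only).  */
static uint64_t
dblalign_sketch (uint64_t a, uint64_t b, const void *src)
{
  unsigned shift = ((uintptr_t) src & 7) * 8;  /* bit offset into A */
  if (shift == 0)
    return a;              /* already aligned; avoid a shift by 64 */
  return (a >> shift) | (b << (64 - shift));
}

int
main (void)
{
  _Alignas (8) unsigned char buf[24];
  for (int i = 0; i < 24; i++)
    buf[i] = (unsigned char) i;

  /* Pretend the source starts 3 bytes into the buffer.  */
  const unsigned char *src = buf + 3;
  uint64_t a, b, merged, expected;
  memcpy (&a, buf, 8);         /* first aligned word covering SRC */
  memcpy (&b, buf + 8, 8);     /* next aligned word */
  merged = dblalign_sketch (a, b, src);
  memcpy (&expected, src, 8);  /* what an unaligned load would yield */

  printf ("dblalign sketch %s\n", merged == expected ? "matches" : "differs");
  return merged == expected ? 0 : 1;
}

The shift == 0 guard matters because shifting a 64-bit value by 64 is undefined in C; the glibc code sidesteps the question by taking the fully aligned branch whenever the source is already 8-byte aligned.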