summary refs log tree commit diff
path: root/ports/sysdeps/tile/tilegx/memcpy.c
diff options
context:
space:
mode:
Diffstat (limited to 'ports/sysdeps/tile/tilegx/memcpy.c')
-rw-r--r-- ports/sysdeps/tile/tilegx/memcpy.c 206
1 files changed, 206 insertions, 0 deletions
diff --git a/ports/sysdeps/tile/tilegx/memcpy.c b/ports/sysdeps/tile/tilegx/memcpy.c
new file mode 100644
index 0000000000..dd6e30dd60
--- /dev/null
+++ b/ports/sysdeps/tile/tilegx/memcpy.c
@@ -0,0 +1,206 @@
+/* Copyright (C) 2011-2012 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Chris Metcalf <cmetcalf@tilera.com>, 2011.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <string.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <arch/chip.h>
+
+/* Word type used by the bulk copy loops.  Must be 8 bytes in size.  */
+#define word_t uint64_t
+
+/* How many cache lines ahead should we prefetch?  */
+#define PREFETCH_LINES_AHEAD 3
+
+/* Copy N bytes from SRCV to DSTV, which must not overlap, and return
+   DSTV.  Short copies go bytewise; longer ones align DST, then copy
+   8-byte words, using wh64 and prefetch for aligned cache-line runs.  */
+void *
+__memcpy (void *__restrict dstv, const void *__restrict srcv, size_t n)
+{
+  char *__restrict dst1 = (char *) dstv;
+  const char *__restrict src1 = (const char *) srcv;
+  const char *__restrict src1_end;
+  const char *__restrict prefetch;
+  word_t *__restrict dst8; /* 8-byte pointer to destination memory. */
+  word_t final; /* Final bytes to write to trailing word, if any */
+  long i;
+
+  if (n < 16)
+    {
+      for (; n; n--)
+        *dst1++ = *src1++;
+      return dstv;
+    }
+
+  /* Last source byte to copy; prefetching must not run past it.  */
+  src1_end = src1 + n - 1;
+
+  /* Prefetch a few lines ahead; wrap to the start, never past the end.  */
+  prefetch = src1;
+  for (i = 0; i < PREFETCH_LINES_AHEAD; i++)
+    {
+      __insn_prefetch (prefetch);
+      prefetch += CHIP_L2_LINE_SIZE ();
+      prefetch = (prefetch < src1_end) ? prefetch : src1;
+    }
+
+  /* Copy bytes until dst is word-aligned.  n >= 16 here, so n stays > 0.  */
+  for (; (uintptr_t) dst1 & (sizeof (word_t) - 1); n--)
+    *dst1++ = *src1++;
+
+  dst8 = (word_t *) dst1;
+
+  if (__builtin_expect ((uintptr_t) src1 & (sizeof (word_t) - 1), 0))
+    {
+      /* Misaligned copy.  Copy 8 bytes at a time, but don't bother
+         with other fanciness.
+         TODO: Consider prefetching and using wh64 as well.  */
+
+      /* Create an aligned src8 by rounding src1 down to a word boundary.  */
+      const word_t *__restrict src8 =
+        (const word_t *) ((uintptr_t) src1 & -sizeof (word_t));
+      word_t b;
+
+      word_t a = *src8++;
+      for (; n >= sizeof (word_t); n -= sizeof (word_t))
+        {
+          b = *src8++;
+          a = __insn_dblalign (a, b, src1);
+          *dst8++ = a;
+          a = b;
+        }
+
+      if (n == 0)
+        return dstv;
+
+      b = ((const char *) src8 <= src1_end) ? *src8 : 0;
+
+      /* Final source bytes to write to trailing partial word, if any. */
+      final = __insn_dblalign (a, b, src1);
+    }
+  else
+    {
+      /* Aligned copy. */
+
+      const word_t *__restrict src8 = (const word_t *) src1;
+
+      /* src8 and dst8 are both word-aligned. */
+      if (n >= CHIP_L2_LINE_SIZE ())
+        {
+          /* Copy until 'dst' is cache-line-aligned. */
+          for (; (uintptr_t) dst8 & (CHIP_L2_LINE_SIZE () - 1);
+               n -= sizeof (word_t))
+            *dst8++ = *src8++;
+
+          /* If copying to self, return.  The test is cheap enough
+             that we do it even though the memcpy() contract doesn't
+             require us to support overlapping dst and src.  This is
+             the most common case of overlap, and any close overlap
+             will cause corruption due to the wh64 below.  It matters
+             because the compiler will emit memcpy() calls for aggregate
+             copies even if it can't prove that src != dst.  */
+          if (__builtin_expect (dst8 == src8, 0))
+            return dstv;
+
+          for (; n >= CHIP_L2_LINE_SIZE ();)
+            {
+              __insn_wh64 (dst8);
+
+              /* Prefetch, then advance to the next line; fall back to the
+                 current source position rather than go past the end.  */
+              __insn_prefetch (prefetch);
+              prefetch += CHIP_L2_LINE_SIZE ();
+              prefetch = (prefetch < src1_end) ? prefetch :
+                (const char *) src8;
+
+              /* Copy an entire cache line.  Manually unrolled to
+                 avoid idiosyncrasies of compiler unrolling.  */
+#define COPY_WORD(offset) ({ dst8[offset] = src8[offset]; n -= 8; })
+              COPY_WORD (0);
+              COPY_WORD (1);
+              COPY_WORD (2);
+              COPY_WORD (3);
+              COPY_WORD (4);
+              COPY_WORD (5);
+              COPY_WORD (6);
+              COPY_WORD (7);
+#if CHIP_L2_LINE_SIZE() != 64
+# error "Fix code that assumes particular L2 cache line size."
+#endif
+
+              dst8 += CHIP_L2_LINE_SIZE () / sizeof (word_t);
+              src8 += CHIP_L2_LINE_SIZE () / sizeof (word_t);
+            }
+        }
+
+      for (; n >= sizeof (word_t); n -= sizeof (word_t))
+        *dst8++ = *src8++;
+
+      if (__builtin_expect (n == 0, 1))
+        return dstv;
+
+      final = *src8;
+    }
+
+  /* n != 0 if we get here.  Write out any trailing bytes. */
+  dst1 = (char *) dst8;
+#ifndef __BIG_ENDIAN__
+  if (n & 4)
+    {
+      *(uint32_t *) dst1 = final;
+      dst1 += 4;
+      final >>= 32;
+      n &= 3;
+    }
+  if (n & 2)
+    {
+      *(uint16_t *) dst1 = final;
+      dst1 += 2;
+      final >>= 16;
+      n &= 1;
+    }
+  if (n)
+    *(uint8_t *) dst1 = final;
+#else
+  if (n & 4)
+    {
+      *(uint32_t *) dst1 = final >> 32;
+      dst1 += 4;
+    }
+  else
+    {
+      final >>= 32;
+    }
+  if (n & 2)
+    {
+      *(uint16_t *) dst1 = final >> 16;
+      dst1 += 2;
+    }
+  else
+    {
+      final >>= 16;
+    }
+  if (n & 1)
+    *(uint8_t *) dst1 = final >> 8;
+#endif
+
+  return dstv;
+}
+weak_alias (__memcpy, memcpy)
+libc_hidden_builtin_def (memcpy)