about summary refs log tree commit diff
path: root/ports/sysdeps/alpha/memchr.c
diff options
context:
space:
mode:
Diffstat (limited to 'ports/sysdeps/alpha/memchr.c')
-rw-r--r--ports/sysdeps/alpha/memchr.c175
1 files changed, 175 insertions, 0 deletions
diff --git a/ports/sysdeps/alpha/memchr.c b/ports/sysdeps/alpha/memchr.c
new file mode 100644
index 0000000000..b19d8200c6
--- /dev/null
+++ b/ports/sysdeps/alpha/memchr.c
@@ -0,0 +1,175 @@
+/* Copyright (C) 2010 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <string.h>
+#include <bp-sym.h>
+
+typedef unsigned long word;
+
+static inline word
+ldq_u(const void *s)
+{
+  return *(const word *)((word)s & -8);
+}
+
+#define unlikely(X)	__builtin_expect ((X), 0)
+#define prefetch(X)	__builtin_prefetch ((void *)(X), 0)
+
+#define cmpbeq0(X)	__builtin_alpha_cmpbge(0, (X))
+#define find(X, Y)	cmpbeq0 ((X) ^ (Y))
+
+/* Search no more than N bytes of S for C.  */
+
+void *
+__memchr (const void *s, int xc, size_t n)
+{
+  const word *s_align;
+  word t, current, found, mask, offset;
+
+  if (unlikely (n == 0))
+    return 0;
+
+  current = ldq_u (s);
+
+  /* Replicate low byte of XC into all bytes of C.  */
+  t = xc & 0xff;			/* 0000000c */
+  t = (t << 8) | t;			/* 000000cc */
+  t = (t << 16) | t;			/* 0000cccc */
+  const word c = (t << 32) | t;		/* cccccccc */
+
+  /* Align the source, and decrement the count by the number
+     of bytes searched in the first word.  */
+  s_align = (const word *)((word)s & -8);
+  n += ((word)s & 7);
+
+  /* Deal with misalignment in the first word for the comparison.  */
+  mask = (1ul << ((word)s & 7)) - 1;
+
+  /* If the entire string fits within one word, we may need masking
+     at both the front and the back of the string.  */
+  if (unlikely (n <= 8))
+    {
+      mask |= -1ul << n;
+      goto last_quad;
+    }
+
+  found = find (current, c) & ~mask;
+  if (unlikely (found))
+    goto found_it;
+
+  s_align++;
+  n -= 8;
+
+  /* If the block is sufficiently large, align to cacheline and prefetch.  */
+  if (unlikely (n >= 256))
+    {
+      /* Prefetch 3 cache lines beyond the one we're working on.  */
+      prefetch (s_align + 8);
+      prefetch (s_align + 16);
+      prefetch (s_align + 24);
+
+      while ((word)s_align & 63)
+	{
+	  current = *s_align;
+	  found = find (current, c);
+	  if (found)
+	    goto found_it;
+	  s_align++;
+	  n -= 8;
+	}
+
+	/* Within each cacheline, advance the load for the next word
+	   before the test for the previous word is complete.  This
+	   allows us to hide the 3 cycle L1 cache load latency.  We
+	   only perform this advance load within a cacheline to prevent
+	   reading across page boundary.  */
+#define CACHELINE_LOOP				\
+	do {					\
+	  word i, next = s_align[0];		\
+	  for (i = 0; i < 7; ++i)		\
+	    {					\
+	      current = next;			\
+	      next = s_align[1];		\
+	      found = find (current, c);	\
+	      if (unlikely (found))		\
+		goto found_it;			\
+	      s_align++;			\
+	    }					\
+	  current = next;			\
+	  found = find (current, c);		\
+	  if (unlikely (found))			\
+	    goto found_it;			\
+	  s_align++;				\
+	  n -= 64;				\
+	} while (0)
+      
+      /* While there's still lots more data to potentially be read,
+	 continue issuing prefetches for the 4th cacheline out.  */
+      while (n >= 256)
+	{
+	  prefetch (s_align + 24);
+	  CACHELINE_LOOP;
+	}
+
+      /* Up to 3 cache lines remaining.  Continue issuing advanced
+	 loads, but stop prefetching.  */
+      while (n >= 64)
+	CACHELINE_LOOP;
+
+      /* We may have exhausted the buffer.  */
+      if (n == 0)
+	return NULL;
+    }
+
+  /* Quadword aligned loop.  */
+  current = *s_align;
+  while (n > 8)
+    {
+      found = find (current, c);
+      if (unlikely (found))
+	goto found_it;
+      current = *++s_align;
+      n -= 8;
+    }
+
+  /* The last word may need masking at the tail of the compare.  */
+  mask = -1ul << n;
+ last_quad:
+  found = find (current, c) & ~mask;
+  if (found == 0)
+    return NULL;
+
+ found_it:
+#ifdef __alpha_cix__
+  offset = __builtin_alpha_cttz (found);
+#else
+  /* Extract LSB.  */
+  found &= -found;
+
+  /* Binary search for the LSB.  */
+  offset  = (found & 0x0f ? 0 : 4);
+  offset += (found & 0x33 ? 0 : 2);
+  offset += (found & 0x55 ? 0 : 1);
+#endif
+
+  return (void *)((word)s_align + offset);
+}
+
+#ifdef weak_alias
+weak_alias (__memchr, BP_SYM (memchr))
+#endif
+libc_hidden_builtin_def (memchr)