Diffstat (limited to 'nscd/connections.c')
 -rw-r--r--  nscd/connections.c | 702
 1 file changed, 621 insertions, 81 deletions
diff --git a/nscd/connections.c b/nscd/connections.c
index 53795bb3b9..2bd3bec5b0 100644
--- a/nscd/connections.c
+++ b/nscd/connections.c
@@ -18,6 +18,7 @@
    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
    02111-1307 USA.  */
 
+#include <alloca.h>
 #include <assert.h>
 #include <atomic.h>
 #include <error.h>
@@ -32,6 +33,9 @@
 #include <stdlib.h>
 #include <unistd.h>
 #include <arpa/inet.h>
+#ifdef HAVE_EPOLL
+# include <sys/epoll.h>
+#endif
 #include <sys/mman.h>
 #include <sys/param.h>
 #include <sys/poll.h>
@@ -65,6 +69,8 @@ static gid_t *server_groups;
 #endif
 static int server_ngroups;
 
+static pthread_attr_t attr;
+
 static void begin_drop_privileges (void);
 static void finish_drop_privileges (void);
 
@@ -163,8 +169,10 @@ static struct database_dyn *const serv2db[LASTREQ] =
 #define CACHE_PRUNE_INTERVAL	15
 
 
-/* Number of threads to use.  */
+/* Initial number of threads to use.  */
 int nthreads = -1;
+/* Maximum number of threads to use.  */
+int max_nthreads = 32;
 
 /* Socket for incoming connections.  */
 static int sock;
@@ -434,6 +442,18 @@ cannot create read-only descriptor for \"%s\"; no mmap"),
 	      }
 	  }
 
+	if (paranoia
+	    && ((dbs[cnt].wr_fd != -1
+		 && fcntl (dbs[cnt].wr_fd, F_SETFD, FD_CLOEXEC) == -1)
+		|| (dbs[cnt].ro_fd != -1
+		    && fcntl (dbs[cnt].ro_fd, F_SETFD, FD_CLOEXEC) == -1)))
+	  {
+	    dbg_log (_("\
+cannot set socket to close on exec: %s; disabling paranoia mode"),
+		     strerror (errno));
+	    paranoia = 0;
+	  }
+
 	if (dbs[cnt].head == NULL)
 	  {
 	    /* We do not use the persistent database.  Just
@@ -490,11 +510,22 @@ cannot create read-only descriptor for \"%s\"; no mmap"),
       exit (1);
     }
 
-  /* We don't wait for data otherwise races between threads can get
-     them stuck on accept.  */
+  /* We don't want to get stuck on accept.  */
   int fl = fcntl (sock, F_GETFL);
-  if (fl != -1)
-    fcntl (sock, F_SETFL, fl | O_NONBLOCK);
+  if (fl == -1 || fcntl (sock, F_SETFL, fl | O_NONBLOCK) == -1)
+    {
+      dbg_log (_("cannot change socket to nonblocking mode: %s"),
+	       strerror (errno));
+      exit (1);
+    }
+
+  /* The descriptor needs to be closed on exec.  */
+  if (paranoia && fcntl (sock, F_SETFD, FD_CLOEXEC) == -1)
+    {
+      dbg_log (_("cannot set socket to close on exec: %s"),
+	       strerror (errno));
+      exit (1);
+    }
 
   /* Set permissions for the socket.  */
   chmod (_PATH_NSCDSOCKET, DEFFILEMODE);
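
The hunk above switches the listening socket to non-blocking mode and, in paranoia mode, marks it close-on-exec.  A minimal stand-alone sketch of that fcntl pattern; make_nonblock_cloexec is an illustrative helper, not part of nscd:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>

static int
make_nonblock_cloexec (int fd)
{
  /* O_NONBLOCK is a file status flag, changed with F_SETFL.  */
  int fl = fcntl (fd, F_GETFL);
  if (fl == -1 || fcntl (fd, F_SETFL, fl | O_NONBLOCK) == -1)
    {
      fprintf (stderr, "cannot change socket to nonblocking mode: %s\n",
               strerror (errno));
      return -1;
    }

  /* FD_CLOEXEC is a descriptor flag, changed with F_SETFD.  */
  if (fcntl (fd, F_SETFD, FD_CLOEXEC) == -1)
    {
      fprintf (stderr, "cannot set socket to close on exec: %s\n",
               strerror (errno));
      return -1;
    }

  return 0;
}

Keeping the two fcntl commands apart matters: passing FD_CLOEXEC to F_SETFL or O_NONBLOCK to F_SETFD succeeds but does not set the intended flag.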
@@ -785,91 +816,253 @@ cannot handle old request version %d; current version is %d"),
 }
 
 
+/* Restart the process.  */
+static void
+restart (void)
+{
+  /* First determine the parameters.  We do not use the parameters
+     passed to main() since they are not usable in case nscd was
+     started by running the dynamic linker explicitly.  That is not
+     the usual case, but nscd is part of glibc and we occasionally do
+     this.  */
+  size_t buflen = 1024;
+  char *buf = alloca (buflen);
+  size_t readlen = 0;
+  int fd = open ("/proc/self/cmdline", O_RDONLY);
+  if (fd == -1)
+    {
+      dbg_log (_("\
+cannot open /proc/self/cmdline: %s; disabling paranoia mode"),
+	       strerror (errno));
+
+      paranoia = 0;
+      return;
+    }
+
+  while (1)
+    {
+      ssize_t n = TEMP_FAILURE_RETRY (read (fd, buf + readlen,
+					    buflen - readlen));
+      if (n == -1)
+	{
+	  dbg_log (_("\
+cannot open /proc/self/cmdline: %s; disabling paranoia mode"),
+		   strerror (errno));
+
+	  close (fd);
+	  paranoia = 0;
+	  return;
+	}
+
+      readlen += n;
+
+      if (readlen < buflen)
+	break;
+
+      /* We might have to extend the buffer.  */
+      size_t old_buflen = buflen;
+      char *newp = extend_alloca (buf, buflen, 2 * buflen);
+      buf = memmove (newp, buf, old_buflen);
+    }
+
+  close (fd);
+
+  /* Parse the command line.  Worst case scenario: every two
+     characters form one parameter (one character plus NUL).  */
+  char **argv = alloca ((readlen / 2 + 1) * sizeof (argv[0]));
+  int argc = 0;
+
+  char *cp = buf;
+  while (cp < buf + readlen)
+    {
+      argv[argc++] = cp;
+      cp = (char *) rawmemchr (cp, '\0') + 1;
+    }
+  argv[argc] = NULL;
+
+  /* Second, change back to the old user if we changed it.  */
+  if (server_user != NULL)
+    {
+      if (setuid (old_uid) != 0)
+	{
+	  dbg_log (_("\
+cannot change to old UID: %s; disabling paranoia mode"),
+		   strerror (errno));
+
+	  paranoia = 0;
+	  return;
+	}
+
+      if (setgid (old_gid) != 0)
+	{
+	  dbg_log (_("\
+cannot change to old GID: %s; disabling paranoia mode"),
+		   strerror (errno));
+
+	  setuid (server_uid);
+	  paranoia = 0;
+	  return;
+	}
+    }
+
+  /* Next change back to the old working directory.  */
+  if (chdir (oldcwd) == -1)
+    {
+      dbg_log (_("\
+cannot change to old working directory: %s; disabling paranoia mode"),
+	       strerror (errno));
+
+      if (server_user != NULL)
+	{
+	  setuid (server_uid);
+	  setgid (server_gid);
+	}
+      paranoia = 0;
+      return;
+    }
+
+  /* Synchronize memory.  */
+  for (int cnt = 0; cnt < lastdb; ++cnt)
+    {
+      /* Make sure nobody keeps using the database.  */
+      dbs[cnt].head->timestamp = 0;
+
+      if (dbs[cnt].persistent)
+	// XXX async OK?
+	msync (dbs[cnt].head, dbs[cnt].memsize, MS_ASYNC);
+    }
+
+  /* The preparations are done.  */
+  execv ("/proc/self/exe", argv);
+
+  /* If we come here, we will never be able to re-exec.  */
+  dbg_log (_("re-exec failed: %s; disabling paranoia mode"),
+	   strerror (errno));
+
+  if (server_user != NULL)
+    {
+      setuid (server_uid);
+      setgid (server_gid);
+    }
+  chdir ("/");
+  paranoia = 0;
+}
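
restart () above rebuilds argv from /proc/self/cmdline because the arguments passed to main() are not usable when nscd was started via the dynamic linker.  A simplified, self-contained sketch of that parsing, using heap allocation instead of alloca/extend_alloca; read_cmdline () and the initial buffer size are illustrative, and the TEMP_FAILURE_RETRY handling of EINTR is omitted:

#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

static char **
read_cmdline (void)
{
  int fd = open ("/proc/self/cmdline", O_RDONLY);
  if (fd == -1)
    return NULL;

  size_t buflen = 1024;
  size_t readlen = 0;
  char *buf = malloc (buflen);

  while (buf != NULL)
    {
      ssize_t n = read (fd, buf + readlen, buflen - readlen);
      if (n < 0)
        {
          free (buf);
          buf = NULL;
          break;
        }
      readlen += n;
      if (readlen < buflen)
        /* Short read: we have seen the end of the file.  */
        break;

      /* The buffer is full; it might be truncated, so grow it.  */
      char *newp = realloc (buf, buflen *= 2);
      if (newp == NULL)
        free (buf);
      buf = newp;
    }
  close (fd);
  if (buf == NULL)
    return NULL;

  /* The arguments are separated by NUL bytes.  One pointer per byte
     (plus the terminating NULL) is always enough.  */
  char **argv = malloc ((readlen + 1) * sizeof (argv[0]));
  if (argv == NULL)
    {
      free (buf);
      return NULL;
    }

  int argc = 0;
  char *cp = buf;
  while (cp < buf + readlen)
    {
      argv[argc++] = cp;
      cp = memchr (cp, '\0', buf + readlen - cp);
      if (cp == NULL)
        break;
      ++cp;
    }
  argv[argc] = NULL;
  return argv;
}

The returned vector points into the heap buffer, which is exactly what a subsequent execv ("/proc/self/exe", argv) needs; neither allocation is freed before the exec.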
+
+
+/* List of file descriptors.  */
+struct fdlist
+{
+  int fd;
+  struct fdlist *next;
+};
+/* Memory allocated for the list.  */
+static struct fdlist *fdlist;
+/* List of currently ready-to-read file descriptors.  */
+static struct fdlist *readylist;
+
+/* Condition variable and mutex to signal availability of entries in
+   READYLIST.  The condvar is initialized dynamically since we might
+   use a different clock depending on availability.  */
+static pthread_cond_t readylist_cond;
+static pthread_mutex_t readylist_lock = PTHREAD_MUTEX_INITIALIZER;
+
+/* The clock to use with the condvar.  */
+static clockid_t timeout_clock = CLOCK_REALTIME;
+
+/* Number of threads ready to handle the READYLIST.  */
+static unsigned long int nready;
+
+
 /* This is the main loop.  It is replicated in different threads but the
    `poll' call makes sure only one thread handles an incoming connection.  */
 static void *
 __attribute__ ((__noreturn__))
 nscd_run (void *p)
 {
-  long int my_number = (long int) p;
-  struct pollfd conn;
-  int run_prune = my_number < lastdb && dbs[my_number].enabled;
-  time_t next_prune = run_prune ? time (NULL) + CACHE_PRUNE_INTERVAL : 0;
-  static unsigned long int nready;
+  const long int my_number = (long int) p;
+  const int run_prune = my_number < lastdb && dbs[my_number].enabled;
+  struct timespec prune_ts;
+  int to = 0;
+  char buf[256];
 
   if (run_prune)
-    setup_thread (&dbs[my_number]);
+    {
+      setup_thread (&dbs[my_number]);
 
-  conn.fd = sock;
-  conn.events = POLLRDNORM;
+      /* We are running.  */
+      dbs[my_number].head->timestamp = time (NULL);
 
-  while (1)
-    {
-      int nr;
-      time_t now = 0;
+      if (clock_gettime (timeout_clock, &prune_ts) == -1)
+	/* Should never happen.  */
+	abort ();
 
-      /* One more thread available.  */
-      atomic_increment (&nready);
+      /* Compute timeout time.  */
+      prune_ts.tv_sec += CACHE_PRUNE_INTERVAL;
+    }
+
+  /* Initial locking.  */
+  pthread_mutex_lock (&readylist_lock);
 
-    no_conn:
-      do
+  /* One more thread available.  */
+  ++nready;
+
+  while (1)
+    {
+      while (readylist == NULL)
 	{
-	  int timeout = -1;
 	  if (run_prune)
 	    {
-	      /* NB: we do not flush the timestamp update using msync since
-		 this value doesnot matter on disk.  */
-	      dbs[my_number].head->timestamp = now = time (NULL);
-	      timeout = now < next_prune ? 1000 * (next_prune - now) : 0;
+	      /* Wait, but not forever.  */
+	      to = pthread_cond_timedwait (&readylist_cond, &readylist_lock,
+					   &prune_ts);
+
+	      /* If we were woken and there is no work to be done,
+		 just start pruning.  */
+	      if (readylist == NULL && to == ETIMEDOUT)
+		{
+		  --nready;
+		  pthread_mutex_unlock (&readylist_lock);
+		  goto only_prune;
+		}
 	    }
+	  else
+	    /* No timeout needed.  */
+	    pthread_cond_wait (&readylist_cond, &readylist_lock);
+	}
 
-	  nr = poll (&conn, 1, timeout);
+      struct fdlist *it = readylist->next;
+      if (readylist->next == readylist)
+	/* Just one entry on the list.  */
+	readylist = NULL;
+      else
+	readylist->next = it->next;
 
-	  if (nr == 0)
-	    {
-	      /* The `poll' call timed out.  It's time to clean up the
-		 cache.  */
-	      atomic_decrement (&nready);
-	      assert (my_number < lastdb);
-	      prune_cache (&dbs[my_number], time(NULL));
-	      now = time (NULL);
-	      next_prune = now + CACHE_PRUNE_INTERVAL;
-
-	      goto try_get;
-	    }
-	}
-      while ((conn.revents & POLLRDNORM) == 0);
+      /* Extract the information and mark the record ready to be used
+	 again.  */
+      int fd = it->fd;
+      it->next = NULL;
 
-    got_data:;
-      /* We have a new incoming connection.  Accept the connection.  */
-      int fd = TEMP_FAILURE_RETRY (accept (conn.fd, NULL, NULL));
-      request_header req;
-      char buf[256];
-      uid_t uid = -1;
-#ifdef SO_PEERCRED
-      pid_t pid = 0;
-#endif
+      /* One fewer thread available; this one is busy now.  */
+      --nready;
 
-      if (__builtin_expect (fd, 0) < 0)
-	{
-	  if (errno != EAGAIN && errno != EWOULDBLOCK)
-	    dbg_log (_("while accepting connection: %s"),
-		     strerror_r (errno, buf, sizeof (buf)));
-	  goto no_conn;
-	}
+      /* We are done with the list.  */
+      pthread_mutex_unlock (&readylist_lock);
 
-      /* This thread is busy.  */
-      atomic_decrement (&nready);
+      /* We do not want to block on a short read or a slow client.  */
+      int fl = fcntl (fd, F_GETFL);
+      if (fl == -1 || fcntl (fd, F_SETFL, fl | O_NONBLOCK) == -1)
+	goto close_and_out;
 
       /* Now read the request.  */
+      request_header req;
       if (__builtin_expect (TEMP_FAILURE_RETRY (read (fd, &req, sizeof (req)))
 			    != sizeof (req), 0))
 	{
+	  /* We failed to read data.  Note that this also might mean we
+	     failed because we would have blocked.  */
 	  if (debug_level > 0)
 	    dbg_log (_("short read while reading request: %s"),
 		     strerror_r (errno, buf, sizeof (buf)));
-	  close (fd);
-	  continue;
+	  goto close_and_out;
 	}
 
       /* Check whether this is a valid request type.  */
@@ -878,7 +1071,10 @@ nscd_run (void *p)
 
       /* Some systems have no SO_PEERCRED implementation.  They don't
 	 care about security so we don't as well.  */
+      uid_t uid = -1;
 #ifdef SO_PEERCRED
+      pid_t pid = 0;
+
       if (secure_in_use)
 	{
 	  struct ucred caller;
@@ -909,8 +1105,9 @@ nscd_run (void *p)
 
       /* It should not be possible to crash the nscd with a silly
 	 request (i.e., a terribly large key).  We limit the size to 1kb.  */
+#define MAXKEYLEN 1024
       if (__builtin_expect (req.key_len, 1) < 0
-	  || __builtin_expect (req.key_len, 1) > 1024)
+	  || __builtin_expect (req.key_len, 1) > MAXKEYLEN)
 	{
 	  if (debug_level > 0)
 	    dbg_log (_("key length in request too long: %d"), req.key_len);
@@ -918,17 +1115,17 @@ nscd_run (void *p)
       else
 	{
 	  /* Get the key.  */
-	  char keybuf[req.key_len];
+	  char keybuf[MAXKEYLEN];
 
 	  if (__builtin_expect (TEMP_FAILURE_RETRY (read (fd, keybuf,
 							  req.key_len))
 				!= req.key_len, 0))
 	    {
+	      /* Again, this can also mean we would have blocked.  */
 	      if (debug_level > 0)
 		dbg_log (_("short read while reading request key: %s"),
 			 strerror_r (errno, buf, sizeof (buf)));
-	      close (fd);
-	      continue;
+	      goto close_and_out;
 	    }
 
 	  if (__builtin_expect (debug_level, 0) > 0)
@@ -952,44 +1149,380 @@ handle_request: request received (Version = %d)"), req.version);
       /* We are done.  */
       close (fd);
 
-      /* Just determine whether any data is present.  We do this to
-	 measure whether clients are queued up.  */
-    try_get:
-      nr = poll (&conn, 1, 0);
-      if (nr != 0)
+      /* Check whether we should be pruning the cache. */
+      assert (run_prune || to == 0);
+      if (to == ETIMEDOUT)
 	{
-	  if (nready == 0)
-	    ++client_queued;
+	only_prune:
+	  /* The pthread_cond_timedwait() call timed out.  It is time
+	     to clean up the cache.  */
+	  assert (my_number < lastdb);
+	  prune_cache (&dbs[my_number],
+		       prune_ts.tv_sec + (prune_ts.tv_nsec >= 500000000));
+
+	  if (clock_gettime (timeout_clock, &prune_ts) == -1)
+	    /* Should never happen.  */
+	    abort ();
+
+	  /* Compute next timeout time.  */
+	  prune_ts.tv_sec += CACHE_PRUNE_INTERVAL;
+
+	  /* In case the list is empty we do not want to run the prune
+	     code right away again.  */
+	  to = 0;
+	}
+
+      /* Re-locking.  */
+      pthread_mutex_lock (&readylist_lock);
+
+      /* One more thread available.  */
+      ++nready;
+    }
+}
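
The worker loop above replaces the old per-thread poll () with a condition-variable wait: each thread sleeps until the main thread queues a descriptor or the absolute prune deadline passes.  A minimal sketch of that wait pattern, using the default CLOCK_REALTIME condvar (the patch switches the condvar to CLOCK_MONOTONIC in start_threads () when available); the names below are illustrative:

#include <errno.h>
#include <pthread.h>
#include <time.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static void *work;		/* Stands in for READYLIST.  */

static void
wait_for_work_or_prune (void)
{
  /* Absolute deadline, like PRUNE_TS above.  */
  struct timespec deadline;
  clock_gettime (CLOCK_REALTIME, &deadline);
  deadline.tv_sec += 15;	/* CACHE_PRUNE_INTERVAL.  */

  pthread_mutex_lock (&lock);
  while (work == NULL)
    {
      int err = pthread_cond_timedwait (&cond, &lock, &deadline);
      if (err == ETIMEDOUT && work == NULL)
	{
	  /* Timed out with nothing queued: go prune the cache.  */
	  pthread_mutex_unlock (&lock);
	  return;
	}
      /* Otherwise we were signalled (or woke spuriously); recheck.  */
    }

  /* ... dequeue one descriptor and handle the request ...  */
  pthread_mutex_unlock (&lock);
}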
+
 
-	  atomic_increment (&nready);
+static unsigned int nconns;
+
+static void
+fd_ready (int fd)
+{
+  pthread_mutex_lock (&readylist_lock);
+
+  /* Find an empty entry in FDLIST.  */
+  size_t inner;
+  for (inner = 0; inner < nconns; ++inner)
+    if (fdlist[inner].next == NULL)
+      break;
+  assert (inner < nconns);
 
-	  goto got_data;
+  fdlist[inner].fd = fd;
+
+  if (readylist == NULL)
+    readylist = fdlist[inner].next = &fdlist[inner];
+  else
+    {
+      fdlist[inner].next = readylist->next;
+      readylist = readylist->next = &fdlist[inner];
+    }
+
+  bool do_signal = true;
+  if (__builtin_expect (nready == 0, 0))
+    {
+      ++client_queued;
+      do_signal = false;
+
+      /* Try to start another thread to help out.  */
+      pthread_t th;
+      if (nthreads < max_nthreads
+	  && pthread_create (&th, &attr, nscd_run,
+			     (void *) (long int) nthreads) == 0)
+	{
+	  /* We got another thread.  */
+	  ++nthreads;
+	  /* The new thread might need a kick.  */
+	  do_signal = true;
 	}
+
     }
+
+  pthread_mutex_unlock (&readylist_lock);
+
+  /* Tell one of the worker threads there is work to do.  */
+  if (do_signal)
+    pthread_cond_signal (&readylist_cond);
+}
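
fd_ready () above and the dequeue code at the top of nscd_run () keep READYLIST as a circular, singly linked list whose sole pointer addresses the tail element, so readylist->next is the head; that makes both enqueue and dequeue O(1).  A sketch of just that list discipline, with locking omitted and illustrative names:

#include <stddef.h>

struct node
{
  int fd;
  struct node *next;
};

static struct node *ready;	/* Tail of the circular list, or NULL.  */

static void
enqueue (struct node *n)
{
  if (ready == NULL)
    /* Single element: it is its own successor.  */
    ready = n->next = n;
  else
    {
      n->next = ready->next;	/* New element points at the head.  */
      ready = ready->next = n;	/* Old tail links to it; it becomes the tail.  */
    }
}

static struct node *
dequeue (void)
{
  struct node *head = ready->next;
  if (head == ready)
    /* That was the only element.  */
    ready = NULL;
  else
    ready->next = head->next;
  head->next = NULL;		/* A NULL next marks the slot free, as in FDLIST.  */
  return head;
}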
+
+
+/* Check whether restarting should happen.  */
+static inline int
+restart_p (time_t now)
+{
+  return (paranoia && readylist == NULL && nready == nthreads
+	  && now >= restart_time);
 }
 
 
+/* Array for times a connection was accepted.  */
+static time_t *starttime;
+
+
+static void
+__attribute__ ((__noreturn__))
+main_loop_poll (void)
+{
+  struct pollfd *conns = (struct pollfd *) xmalloc (nconns
+						    * sizeof (conns[0]));
+
+  conns[0].fd = sock;
+  conns[0].events = POLLRDNORM;
+  size_t nused = 1;
+  size_t firstfree = 1;
+
+  while (1)
+    {
+      /* Wait for any event.  We wait at most MAIN_THREAD_TIMEOUT
+	 milliseconds so that we can regularly check whether any of the
+	 accepted connections should be closed because no request has
+	 arrived on them.  */
+#define MAX_ACCEPT_TIMEOUT 30
+#define MIN_ACCEPT_TIMEOUT 5
+#define MAIN_THREAD_TIMEOUT \
+  (MAX_ACCEPT_TIMEOUT * 1000						      \
+   - ((MAX_ACCEPT_TIMEOUT - MIN_ACCEPT_TIMEOUT) * 1000 * nused) / (2 * nconns))
+
+      int n = poll (conns, nused, MAIN_THREAD_TIMEOUT);
+
+      time_t now = time (NULL);
+
+      /* If there is a descriptor ready for reading or there is a new
+	 connection, process this now.  */
+      if (n > 0)
+	{
+	  if (conns[0].revents != 0)
+	    {
+	      /* We have a new incoming connection.  Accept the connection.  */
+	      int fd = TEMP_FAILURE_RETRY (accept (sock, NULL, NULL));
+
+	      /* Use the descriptor if we have not reached the limit.  */
+	      if (fd >= 0 && firstfree < nconns)
+		{
+		  conns[firstfree].fd = fd;
+		  conns[firstfree].events = POLLRDNORM;
+		  starttime[firstfree] = now;
+		  if (firstfree >= nused)
+		    nused = firstfree + 1;
+
+		  do
+		    ++firstfree;
+		  while (firstfree < nused && conns[firstfree].fd != -1);
+		}
+
+	      --n;
+	    }
+
+	  for (size_t cnt = 1; cnt < nused && n > 0; ++cnt)
+	    if (conns[cnt].revents != 0)
+	      {
+		fd_ready (conns[cnt].fd);
+
+		/* Clean up the CONNS array.  */
+		conns[cnt].fd = -1;
+		if (cnt < firstfree)
+		  firstfree = cnt;
+		if (cnt == nused - 1)
+		  do
+		    --nused;
+		  while (conns[nused - 1].fd == -1);
+
+		--n;
+	      }
+	}
+
+      /* Now find entries which have timed out.  */
+      assert (nused > 0);
+
+      /* We make the timeout length depend on the number of file
+	 descriptors currently used.  */
+#define ACCEPT_TIMEOUT \
+  (MAX_ACCEPT_TIMEOUT							      \
+   - ((MAX_ACCEPT_TIMEOUT - MIN_ACCEPT_TIMEOUT) * nused) / nconns)
+      time_t laststart = now - ACCEPT_TIMEOUT;
+
+      for (size_t cnt = nused - 1; cnt > 0; --cnt)
+	{
+	  if (conns[cnt].fd != -1 && starttime[cnt] < laststart)
+	    {
+	      /* Remove the entry, it timed out.  */
+	      (void) close (conns[cnt].fd);
+	      conns[cnt].fd = -1;
+
+	      if (cnt < firstfree)
+		firstfree = cnt;
+	      if (cnt == nused - 1)
+		do
+		  --nused;
+		while (conns[nused - 1].fd == -1);
+	    }
+	}
+
+      if (restart_p (now))
+	restart ();
+    }
+}
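
To make the two timeout formulas above concrete, assume nconns = 1024 (the real value is derived from _SC_OPEN_MAX in start_threads () and clamped to the MINCONN/MAXCONN range):

  MAIN_THREAD_TIMEOUT  (nused = 1)    = 30000 - 25000*1/2048    = 29988 ms
  MAIN_THREAD_TIMEOUT  (nused = 1024) = 30000 - 25000*1024/2048 = 17500 ms
  ACCEPT_TIMEOUT       (nused = 1)    = 30 - 25*1/1024          = 30 s
  ACCEPT_TIMEOUT       (nused = 1024) = 30 - 25*1024/1024       =  5 s

So the more descriptors are in use, the more often the main thread wakes up and the sooner idle accepted connections are closed.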
+
+
+#ifdef HAVE_EPOLL
+static void
+main_loop_epoll (int efd)
+{
+  struct epoll_event ev = { 0, };
+  int nused = 1;
+  size_t highest = 0;
+
+  /* Add the socket.  */
+  ev.events = EPOLLRDNORM;
+  ev.data.fd = sock;
+  if (epoll_ctl (efd, EPOLL_CTL_ADD, sock, &ev) == -1)
+    /* We cannot use epoll.  */
+    return;
+
+  while (1)
+    {
+      struct epoll_event revs[100];
+# define nrevs (sizeof (revs) / sizeof (revs[0]))
+
+      int n = epoll_wait (efd, revs, nrevs, MAIN_THREAD_TIMEOUT);
+
+      time_t now = time (NULL);
+
+      for (int cnt = 0; cnt < n; ++cnt)
+	if (revs[cnt].data.fd == sock)
+	  {
+	    /* A new connection.  */
+	    int fd = TEMP_FAILURE_RETRY (accept (sock, NULL, NULL));
+
+	    if (fd >= 0)
+	      {
+		/* Try to add the new descriptor.  */
+		ev.data.fd = fd;
+		if (fd >= nconns
+		    || epoll_ctl (efd, EPOLL_CTL_ADD, fd, &ev) == -1)
+		  /* The descriptor is too large or something went
+		     wrong.  Close the descriptor.  */
+		  close (fd);
+		else
+		  {
+		    /* Remember when we accepted the connection.  */
+		    starttime[fd] = now;
+
+		    if (fd > highest)
+		      highest = fd;
+
+		    ++nused;
+		  }
+	      }
+	  }
+	else
+	  {
+	    /* Remove the descriptor from the epoll descriptor.  */
+	    struct epoll_event ev = { 0, };
+	    (void) epoll_ctl (efd, EPOLL_CTL_DEL, revs[cnt].data.fd, &ev);
+
+	    /* Get a worker to handle the request.  */
+	    fd_ready (revs[cnt].data.fd);
+
+	    /* Reset the time.  */
+	    starttime[revs[cnt].data.fd] = 0;
+	    if (revs[cnt].data.fd == highest)
+	      do
+		--highest;
+	      while (highest > 0 && starttime[highest] == 0);
+
+	    --nused;
+	  }
+
+      /* Now look for descriptors of accepted connections on which no
+	 request has arrived for too long.  */
+      time_t laststart = now - ACCEPT_TIMEOUT;
+      for (int cnt = highest; cnt > STDERR_FILENO; --cnt)
+	if (cnt != sock && starttime[cnt] != 0 && starttime[cnt] < laststart)
+	  {
+	    /* We have been waiting on this one for too long.  Close it.  */
+	    struct epoll_event ev = {0, };
+	    (void) epoll_ctl (efd, EPOLL_CTL_DEL, cnt, &ev);
+
+	    (void) close (cnt);
+
+	    starttime[cnt] = 0;
+	    if (cnt == highest)
+	      --highest;
+	  }
+	else if (cnt != sock && starttime[cnt] == 0 && cnt == highest)
+	  --highest;
+
+      if (restart_p (now))
+	restart ();
+    }
+}
+#endif
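
main_loop_epoll () follows the same scheme as the poll loop: the listening socket and every accepted-but-not-yet-readable connection sit in one event set, and a readable client is removed from the set and handed to a worker thread.  A stripped-down sketch of that pattern, where listen_fd and handle_client () stand in for sock and fd_ready () and the idle-connection timeout handling is omitted:

#include <sys/epoll.h>
#include <sys/socket.h>
#include <unistd.h>

extern int listen_fd;
extern void handle_client (int fd);

static void
epoll_loop (void)
{
  int efd = epoll_create (100);
  if (efd == -1)
    return;			/* Fall back to the poll () loop.  */

  struct epoll_event ev = { 0, };
  ev.events = EPOLLIN;
  ev.data.fd = listen_fd;
  if (epoll_ctl (efd, EPOLL_CTL_ADD, listen_fd, &ev) == -1)
    return;

  while (1)
    {
      struct epoll_event revs[100];
      int n = epoll_wait (efd, revs, 100, 30 * 1000);

      for (int i = 0; i < n; ++i)
	if (revs[i].data.fd == listen_fd)
	  {
	    /* A new connection: accept it and watch it for data.  */
	    int fd = accept (listen_fd, NULL, NULL);
	    if (fd >= 0)
	      {
		ev.data.fd = fd;
		if (epoll_ctl (efd, EPOLL_CTL_ADD, fd, &ev) == -1)
		  close (fd);
	      }
	  }
	else
	  {
	    /* Data arrived: remove the descriptor from the set (old
	       kernels want a non-NULL event even for DEL, hence the
	       dummy, as in the patch) and pass it to a worker.  */
	    struct epoll_event dummy = { 0, };
	    epoll_ctl (efd, EPOLL_CTL_DEL, revs[i].data.fd, &dummy);
	    handle_client (revs[i].data.fd);
	  }
    }
}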
+
+
 /* Start all the threads we want.  The initial process is thread no. 1.  */
 void
 start_threads (void)
 {
-  long int i;
-  pthread_attr_t attr;
-  pthread_t th;
+  /* Initialize the condition variable we will use.  The only
+     non-standard attribute we might use is the clock selection.  */
+  pthread_condattr_t condattr;
+  pthread_condattr_init (&condattr);
+
+#if _POSIX_CLOCK_SELECTION >= 0 && _POSIX_MONOTONIC_CLOCK >= 0
+  /* Determine whether the monotonic clock is available.  */
+  struct timespec dummy;
+  if (clock_getres (CLOCK_MONOTONIC, &dummy) == 0
+      && pthread_condattr_setclock (&condattr, CLOCK_MONOTONIC) == 0)
+    timeout_clock = CLOCK_MONOTONIC;
+#endif
+
+  pthread_cond_init (&readylist_cond, &condattr);
+  pthread_condattr_destroy (&condattr);
 
+
+  /* Create the attribute for the threads.  They are all created
+     detached.  */
   pthread_attr_init (&attr);
   pthread_attr_setdetachstate (&attr, PTHREAD_CREATE_DETACHED);
+  /* Use 1MB stacks, twice as much for 64-bit architectures.  */
+  pthread_attr_setstacksize (&attr, 1024 * 1024 * (sizeof (void *) / 4));
 
   /* We allow less than LASTDB threads only for debugging.  */
   if (debug_level == 0)
     nthreads = MAX (nthreads, lastdb);
 
-  for (i = 1; i < nthreads; ++i)
-    pthread_create (&th, &attr, nscd_run, (void *) i);
+  int nfailed = 0;
+  for (long int i = 0; i < nthreads; ++i)
+    {
+      pthread_t th;
+      if (pthread_create (&th, &attr, nscd_run, (void *) (i - nfailed)) != 0)
+	++nfailed;
+    }
+  if (nthreads - nfailed < lastdb)
+    {
+      /* We could not start enough threads.  */
+      dbg_log (_("could only start %d threads; terminating"),
+	       nthreads - nfailed);
+      exit (1);
+    }
 
-  pthread_attr_destroy (&attr);
+  /* Determine how much room for descriptors we should initially
+     allocate.  This might need to change later if we cap the number
+     with MAXCONN.  */
+  const long int nfds = sysconf (_SC_OPEN_MAX);
+#define MINCONN 32
+#define MAXCONN 16384
+  if (nfds == -1 || nfds > MAXCONN)
+    nconns = MAXCONN;
+  else if (nfds < MINCONN)
+    nconns = MINCONN;
+  else
+    nconns = nfds;
+
+  /* We need memory to pass descriptors on to the worker threads.  */
+  fdlist = (struct fdlist *) xcalloc (nconns, sizeof (fdlist[0]));
+  /* Array to keep track when connection was accepted.  */
+  starttime = (time_t *) xcalloc (nconns, sizeof (starttime[0]));
+
+  /* In the main thread we execute the loop which handles incoming
+     connections.  */
+#ifdef HAVE_EPOLL
+  int efd = epoll_create (100);
+  if (efd != -1)
+    {
+      main_loop_epoll (efd);
+      close (efd);
+    }
+#endif
 
-  nscd_run ((void *) 0);
+  main_loop_poll ();
 }
 
 /* Look up the uid, gid, and supplementary groups to run nscd as. When
@@ -1010,6 +1543,13 @@ begin_drop_privileges (void)
   server_uid = pwd->pw_uid;
   server_gid = pwd->pw_gid;
 
+  /* Save the old UID/GID if we have to change back.  */
+  if (paranoia)
+    {
+      old_uid = getuid ();
+      old_gid = getgid ();
+    }
+
   if (getgrouplist (server_user, server_gid, NULL, &server_ngroups) == 0)
     {
       /* This really must never happen.  */