about summary refs log tree commit diff
diff options
context:
space:
mode:
author: Laurent Bercot <ska-skaware@skarnet.org> 2023-06-14 10:45:37 +0000
committer: Laurent Bercot <ska@appnovation.com> 2023-06-14 10:45:37 +0000
commit: 60c1489b6fc2a3f8bc6054facc85d099bd83ac5e (patch)
tree: 3eca0ced64ed56cf8aa61dedc498b1b95233dc5d
parent: c187bc7686ee71aa95a0a01d3bffedd6914b3c00 (diff)
download: s6-60c1489b6fc2a3f8bc6054facc85d099bd83ac5e.tar.gz
s6-60c1489b6fc2a3f8bc6054facc85d099bd83ac5e.tar.xz
s6-60c1489b6fc2a3f8bc6054facc85d099bd83ac5e.zip
New and improved s6-svscan
Signed-off-by: Laurent Bercot <ska@appnovation.com>
-rw-r--r-- src/supervision/s6-svscan.c 807
1 files changed, 470 insertions, 337 deletions
diff --git a/src/supervision/s6-svscan.c b/src/supervision/s6-svscan.c
index 4334e64..91c9f14 100644
--- a/src/supervision/s6-svscan.c
+++ b/src/supervision/s6-svscan.c
@@ -9,22 +9,30 @@
 #include <signal.h>
 
 #include <skalibs/posixplz.h>
+#include <skalibs/uint32.h>
 #include <skalibs/allreadwrite.h>
 #include <skalibs/sgetopt.h>
 #include <skalibs/types.h>
 #include <skalibs/strerr.h>
 #include <skalibs/tai.h>
 #include <skalibs/iopause.h>
+#include <skalibs/devino.h>
 #include <skalibs/djbunix.h>
 #include <skalibs/direntry.h>
 #include <skalibs/sig.h>
 #include <skalibs/selfpipe.h>
 #include <skalibs/exec.h>
+#include <skalibs/bitarray.h>
+#include <skalibs/genset.h>
+#include <skalibs/avltreen.h>
+#include <skalibs/lolstdio.h>
 
 #include <s6/config.h>
 #include <s6/supervise.h>
 
-#define USAGE "s6-svscan [ -c maxservices ] [ -t timeout ] [ -d notif ] [ -X consoleholder ] [ dir ]"
+#include <skalibs/posixishard.h>
+
+#define USAGE "s6-svscan [ -c services_max | -C services_max ] [ -L name_max ] [ -t timeout ] [ -d notif ] [ -X consoleholder ] [ dir ]"
 #define dieusage() strerr_dieusage(100, USAGE)
 
 #define CTL S6_SVSCAN_CTLDIR "/control"
@@ -35,35 +43,79 @@
 #define SIGNAL_PROG_LEN (sizeof(SIGNAL_PROG) - 1)
 #define SPECIAL_LOGGER_SERVICE "s6-svscan-log"
 
-#define DIR_RETRY_TIMEOUT 3
-#define CHECK_RETRY_TIMEOUT 4
+typedef struct service_s service, *service_ref ;
+struct service_s
+{
+  devino devino ;
+  pid_t pid ;
+  tain start ;
+  int p ;
+  uint32_t peer ;
+} ;
 
-struct svinfo_s
+struct flags_s
 {
-  dev_t dev ;
-  ino_t ino ;
-  tain restartafter[2] ;
-  pid_t pid[2] ;
-  int p[2] ;
-  unsigned int flagactive : 1 ;
-  unsigned int flaglog : 1 ;
-  unsigned int flagspecial : 1 ;
+  uint8_t cont : 1 ;
+  uint8_t waitall : 1 ;
 } ;
 
-static struct svinfo_s *services ;
-static unsigned int max = 500 ;
-static unsigned int n = 0 ;
-static tain deadline, defaulttimeout ;
-static int wantreap = 1 ;
-static int wantscan = 1 ;
-static unsigned int wantkill = 0 ;
-static int cont = 1 ;
-static int waitall = 1 ;
-static int consoleholder = -1 ;
+static unsigned int consoleholder = 0 ;
+static struct flags_s flags = { .cont = 1, .waitall = 0 } ;
+
+static uint32_t namemax = 251 ;
+static char *names ;
+#define NAME(i) (names + (i) * (namemax + 5))
+
+static genset *services ;
+#define SERVICE(i) genset_p(service, services, (i))
+static uint32_t max = 1000 ;
+static uint32_t special ;
+
+static avltreen *by_pid ;
+static avltreen *by_devino ;
+static char *active ;
+
+static tain scan_deadline = TAIN_EPOCH ;
+static tain start_deadline = TAIN_INFINITE ;
+static tain scantto = TAIN_INFINITE_RELATIVE ;
+
+
+ /* Tree management */
+
+static void *bydevino_dtok (uint32_t d, void *aux)
+{
+  genset *g = aux ;
+  return &genset_p(service, g, d)->devino ;
+}
+
+static int bydevino_cmp (void const *a, void const *b, void *aux)
+{
+  (void)aux ;
+  LOLDEBUG("bydevino_cmp: (%llu, %llu) vs (%llu, %llu)", (unsigned long long)((devino const *)a)->dev, (unsigned long long)((devino const *)a)->ino, (unsigned long long)((devino const *)b)->dev, (unsigned long long)((devino const *)b)->ino) ;
+  return devino_cmp(a, b) ;
+}
+
+static void *bypid_dtok (uint32_t d, void *aux)
+{
+  genset *g = aux ;
+  return &genset_p(service, g, d)->pid ;
+}
+
+static int bypid_cmp (void const *a, void const *b, void *aux) /* key comparator handed to avltreen_init for the by_pid tree */
+{
+  (void)aux ; /* unused */
+  pid_t const *aa = a ; /* keys are pointers to pid_t, as produced by bypid_dtok */
+  pid_t const *bb = b ;
+  LOLDEBUG("bypid_cmp: %llu vs %llu", (unsigned long long)*aa, (unsigned long long)*bb) ;
+  return *aa < *bb ? -1 : *aa > *bb ; /* three-way result: -1, 0 or 1 */
+}
+
+
+ /* On-exit utility */
 
 static void restore_console (void)
 {
-  if (consoleholder >= 0)
+  if (consoleholder)
   {
     fd_move(2, consoleholder) ;
     if (fd_copy(1, 2) < 0) strerr_warnwu1sys("restore stdout") ;
@@ -77,8 +129,7 @@ static void panicnosp (char const *errmsg)
   strerr_warnwu1sys(errmsg) ;
   strerr_warnw2x("executing into ", eargv[0]) ;
   execv(eargv[0], (char *const *)eargv) ;
- /* and if that exec fails, screw it and just die */
-  strerr_dieexec(111, eargv[0]) ;
+  strerr_dieexec(errno == ENOENT ? 127 : 126, eargv[0]) ;
 }
 
 static void panic (char const *) gccattr_noreturn ;
@@ -91,33 +142,19 @@ static void panic (char const *errmsg)
   panicnosp(errmsg) ;
 }
 
-static void killthem (void)
+static int close_pipes_iter (void *data, void *aux)
 {
-  unsigned int i = 0 ;
-  if (!wantkill) return ;
-  for (; i < n ; i++)
-  {
-    if (!(wantkill & 1) && services[i].flagactive) continue ;
-    if (services[i].pid[0])
-      kill(services[i].pid[0], (wantkill & (2 << services[i].flagspecial)) ? SIGTERM : SIGHUP) ;
-    if (services[i].flaglog && services[i].pid[1])
-      kill(services[i].pid[1], (wantkill & 4) ? SIGTERM : SIGHUP) ;
-  }
-  wantkill = 0 ;
+  service *sv = data ;
+  LOLDEBUG("close_pipes_iter for %u: %d", sv - SERVICE(0), sv->p) ;
+  if (sv->p >= 0) close(sv->p) ;
+  (void)aux ;
+  return 1 ;
 }
 
-static inline void closethem (void)
+static inline void close_pipes (void)
 {
-  int gotspecial = 0 ;
-  unsigned int i = 0 ;
-  for (; i < n ; i++)
-    if (services[i].flagspecial) gotspecial = 1 ;
-    else if (services[i].flaglog)
-    {
-      if (services[i].p[1] >= 0) close(services[i].p[1]) ;
-      if (services[i].p[0] >= 0) close(services[i].p[0]) ;
-    }
-  if (gotspecial)
+  genset_iter(services, &close_pipes_iter, 0) ;
+  if (special < max)
   {
     close(1) ;
     if (open("/dev/null", O_WRONLY) < 0)
@@ -127,58 +164,68 @@ static inline void closethem (void)
 
 static inline void waitthem (void)
 {
-  unsigned int m = 0 ;
-  unsigned int i = 0 ;
-  pid_t pids[n << 1] ;
-  for (; i < n ; i++)
+  while (avltreen_len(by_pid))
   {
-    if (services[i].pid[0])
-      pids[m++] = services[i].pid[0] ;
-    if (services[i].flaglog && services[i].pid[1])
-      pids[m++] = services[i].pid[1] ;
+    int wstat ;
+    pid_t pid = wait_nointr(&wstat) ;
+    if (pid < 0)
+    {
+      strerr_warnwu1sys("wait for all s6-supervise processes") ;
+      break ;
+    }
+    avltreen_delete(by_pid, &pid) ;
   }
-  if (!waitn(pids, m))
-    strerr_warnwu1sys("wait for all s6-supervise processes") ;
 }
 
-static inline void chld (void)
+
+ /* Misc utility */
+
+static inline int is_logger (uint32_t i) /* returns 1 if the stored name of service i contains a '/', else 0 */
+{
+  return !!strchr(NAME(i), '/') ; /* scan() registers loggers under "<dir>/log"; top-level entries have no slash */
+}
+
+ /* Triggered actions: config */
+
+static inline void chld (unsigned int *what) /* SIGCHLD, or control command 'z' */
 {
-  wantreap = 1 ;
+  *what |= 8 ; /* bit 8: the main loop calls reap() after handling this batch of events */
 }
 
 static inline void alrm (void)
 {
-  wantscan = 1 ;
+  tain_copynow(&scan_deadline) ;
 }
 
 static inline void abrt (void)
 {
-  cont = 0 ;
-  waitall = 0 ;
+  flags.cont = 0 ;
+  flags.waitall = 0 ;
 }
 
-static void hup (void)
+static void hup (unsigned int *what) /* SIGHUP, or control command 'h' */
 {
-  wantkill = 2 ;
-  wantscan = 1 ;
+  *what |= 2 ; /* bit 1 clear: killthem_iter only signals inactive services; bit 2: SIGTERM for regular ones, SIGHUP for loggers/special */
+  tain_copynow(&scan_deadline) ; /* make a rescan due right away */
 }
 
-static void term (void)
+static void term (unsigned int *what) /* SIGTERM / SIGINT, or control commands 't' / 'i' */
 {
-  cont = 0 ;
-  waitall = 1 ;
-  wantkill = 3 ;
+  flags.cont = 0 ; /* leave the main loop */
+  flags.waitall = 1 ; /* finish phase will run waitthem() */
+  *what |= 3 ; /* bit 1: signal every service, active or not; bit 2: SIGTERM for regular services, loggers/special get SIGHUP */
 }
 
-static void quit (void)
+static void quit (unsigned int *what) /* SIGQUIT, or control command 'q' */
 {
-  cont = 0 ;
-  waitall = 1 ;
-  wantkill = 7 ;
+  flags.cont = 0 ; /* leave the main loop */
+  flags.waitall = 1 ; /* finish phase will run waitthem() */
+  *what |= 7 ; /* bits 1|2|4: signal every service, and use SIGTERM for loggers and the special service too */
 }
 
-static void handle_signals (void)
+static void handle_signals (unsigned int *what)
 {
+  LOLDEBUG("handle_signals") ;
   for (;;)
   {
     int sig = selfpipe_read() ;
@@ -186,7 +233,7 @@ static void handle_signals (void)
     {
       case -1 : panic("selfpipe_read") ;
       case 0 : return ;
-      case SIGCHLD : chld() ; break ;
+      case SIGCHLD : chld(what) ; break ;
       case SIGALRM : alrm() ; break ;
       case SIGABRT : abrt() ; break ;
       default :
@@ -202,10 +249,10 @@ static void handle_signals (void)
           if (errno != ENOENT) strerr_warnwu2sys("spawn ", newargv[0]) ;
           switch (sig)
           {
-            case SIGHUP : hup() ; break ;
+            case SIGHUP : hup(what) ; break ;
             case SIGINT :
-            case SIGTERM : term() ; break ;
-            case SIGQUIT : quit() ; break ;
+            case SIGTERM : term(what) ; break ;
+            case SIGQUIT : quit(what) ; break ;
           }
         }
       }
@@ -213,8 +260,9 @@ static void handle_signals (void)
   }
 }
 
-static void handle_control (int fd)
+static void handle_control (int fd, unsigned int *what)
 {
+  LOLDEBUG("handle_control") ;
   for (;;)
   {
     char c ;
@@ -223,15 +271,15 @@ static void handle_control (int fd)
     else if (!r) break ;
     else switch (c)
     {
-      case 'z' : chld() ; break ;
+      case 'z' : chld(what) ; break ;
       case 'a' : alrm() ; break ;
       case 'b' : abrt() ; break ;
-      case 'h' : hup() ; break ;
+      case 'h' : hup(what) ; break ;
       case 'i' :
-      case 't' : term() ; break ;
-      case 'q' : quit() ; break ;
-      case 'n' : wantkill = 2 ; break ;
-      case 'N' : wantkill = 6 ; break ;
+      case 't' : term(what) ; break ;
+      case 'q' : quit(what) ; break ;
+      case 'n' : *what |= 2 ; break ;
+      case 'N' : *what |= 6 ; break ;
       default :
       {
         char s[2] = { c, 0 } ;
@@ -242,264 +290,318 @@ static void handle_control (int fd)
 }
 
 
-/* First essential function: the reaper.
-   s6-svscan must wait() for all children,
-   including ones it doesn't know it has.
-   Dead active services are flagged to be restarted in 1 second. */
+ /* Triggered action: killer */
 
-static void reap (void)
+static int killthem_iter (void *data, void *aux)
 {
-  tain nextscan ;
-  if (!wantreap) return ;
-  wantreap = 0 ;
-  tain_addsec_g(&nextscan, 1) ;
-  for (;;)
+  service *sv = data ;
+  unsigned int *what = aux ;
+  uint32_t i = sv - SERVICE(0) ;
+  if ((*what & 1 || !bitarray_peek(active, i)) && sv->pid)
   {
-    int wstat ;
-    pid_t r = wait_nohang(&wstat) ;
-    if (r < 0)
-      if (errno != ECHILD) panic("wait_nohang") ;
-      else break ;
-    else if (!r) break ;
-    else
-    {
-      unsigned int i = 0 ;
-      for (; i < n ; i++)
-      {
-        if (services[i].pid[0] == r)
-        {
-          services[i].pid[0] = 0 ;
-          services[i].restartafter[0] = nextscan ;
-          break ;
-        }
-        else if (services[i].pid[1] == r)
-        {
-          services[i].pid[1] = 0 ;
-          services[i].restartafter[1] = nextscan ;
-          break ;
-        }
-      }
-      if (i == n) continue ;
-      if (services[i].flagactive)
-      {
-        if (tain_less(&nextscan, &deadline)) deadline = nextscan ;
-      }
-      else
-      {
-        if (services[i].flaglog)
-        {
- /*
-    BLACK MAGIC:
-     - we need to close the pipe early:
-       * as soon as the writer exits so the logger can exit on EOF
-       * or as soon as the logger exits so the writer can crash on EPIPE
-     - but if the same service gets reactivated before the second
-       supervise process exits, ouch: we've lost the pipe
-     - so we can't reuse the same service even if it gets reactivated
-     - so we're marking a dying service with a closed pipe
-     - if the scanner sees a service with p[0] = -1 it won't flag
-       it as active (and won't restart the dead supervise)
-     - but if the service gets reactivated we want it to restart
-       as soon as the 2nd supervise process dies
-     - so the scanner marks such a process with p[0] = -2
-     - and the reaper triggers a scan when it finds a -2.
- */
-          if (services[i].p[0] >= 0)
-          {
-            fd_close(services[i].p[1]) ; services[i].p[1] = -1 ;
-            fd_close(services[i].p[0]) ; services[i].p[0] = -1 ;
-          }
-          else if (services[i].p[0] == -2) wantscan = 1 ;
-        }
-        if (!services[i].pid[0] && (!services[i].flaglog || !services[i].pid[1]))
-          services[i] = services[--n] ;
-      }
-    }
+    LOLDEBUG("killthem: service %u: sending %s to pid %llu", i, *what & (2 << (i == special || is_logger(i))) ? "SIGTERM" : "SIGHUP", (unsigned long long)sv->pid) ;
+    kill(sv->pid, *what & (2 << (i == special || is_logger(i))) ? SIGTERM : SIGHUP) ;
   }
+  return 1 ;
 }
 
+static inline void killthem (unsigned int what)
+{
+  genset_iter(services, &killthem_iter, &what) ;
+}
+
+
+ /* Triggered action: reaper */
 
-/* Second essential function: the scanner.
-   It monitors the service directories and spawns a supervisor
-   if needed. */
+ /*
+   sv->p values:
+   0+ : this end of the pipe
+   -1 : not a logged service
+   -2 : inactive and peer dead, do not reactivate
+   -3 : reactivation wanted, trigger rescan on death
+ */
 
-static void trystart (unsigned int i, char const *name, int islog)
+static void remove_service (service *sv)
 {
-  pid_t pid = fork() ;
-  switch (pid)
+  LOLDEBUG("remove_service: %u", sv - SERVICE(0)) ;
+  if (sv->peer < max)
   {
-    case -1 :
-      tain_addsec_g(&services[i].restartafter[islog], CHECK_RETRY_TIMEOUT) ;
-      strerr_warnwu2sys("fork for ", name) ;
-      return ;
-    case 0 :
+    service *peer = SERVICE(sv->peer) ;
+    if (peer->p >= 0)
     {
-      char const *cargv[3] = { "s6-supervise", name, 0 } ;
-      PROG = "s6-svscan (child)" ;
-      selfpipe_finish() ;
-      if (services[i].flaglog)
-        if (fd_move(!islog, services[i].p[!islog]) == -1)
-          strerr_diefu2sys(111, "set fds for ", name) ;
-      if (consoleholder >= 0 && services[i].flagspecial
-       && fd_move(2, consoleholder) < 0)  /* autoclears coe */
-         strerr_diefu2sys(111, "restore console fd for service ", name) ;
-      xexec_a(S6_BINPREFIX "s6-supervise", cargv) ;
+      close(peer->p) ;
+      peer->p = -2 ;
     }
+    peer->peer = max ;
   }
-  services[i].pid[islog] = pid ;
+  if (sv->p == -3) tain_earliest1(&scan_deadline, &sv->start) ;
+  else if (sv->p >= 0) close(sv->p) ;
+  avltreen_delete(by_devino, &sv->devino) ;
+  genset_delete(services, sv - SERVICE(0)) ;
 }
 
-static void retrydirlater (void)
+static void reap (void)
 {
-  tain a ;
-  tain_addsec_g(&a, DIR_RETRY_TIMEOUT) ;
-  if (tain_less(&a, &deadline)) deadline = a ;
+  LOLDEBUG("reap") ;
+  for (;;)
+  {
+    uint32_t i ;
+    int wstat ;
+    pid_t pid = wait_nohang(&wstat) ;
+    if (pid < 0)
+      if (errno != ECHILD) panic("wait_nohang") ;
+      else break ;
+    else if (!pid) break ;
+    else
+    {
+      LOLDEBUG("reap: pid %llu", (unsigned long long)pid) ;
+      if (avltreen_search(by_pid, &pid, &i))
+      {
+        service *sv = SERVICE(i) ;
+        LOLDEBUG("reap: pid %llu is service %u", (unsigned long long)pid, i) ;
+        avltreen_delete(by_pid, &pid) ;
+        sv->pid = 0 ;
+        if (bitarray_peek(active, i)) tain_earliest1(&start_deadline, &sv->start) ;
+        else remove_service(sv) ;
+      }
+    }
+  }
 }
 
-static inline void check (char const *name)
+
+ /*
+    On-timeout action: scanner.
+    (This can be triggered, but the trigger just sets the timeout to 0.)
+    It's on-timeout because it can fail and get rescheduled for later.
+ */
+
+static int check (char const *name, uint32_t prod, char *act)
 {
   struct stat st ;
-  size_t namelen ;
-  unsigned int i = 0 ;
-  if (name[0] == '.') return ;
+  devino di ;
+  uint32_t i ;
+  service *sv ;
+  LOLDEBUG("checking %s (producer is %u)", name, prod) ;
   if (stat(name, &st) == -1)
   {
+    if (prod < max && errno == ENOENT)
+    {
+      if (SERVICE(prod)->peer < max)
+        strerr_warnw3x("logger for service ", NAME(prod), " has been moved") ;
+      return max ;
+    }
     strerr_warnwu2sys("stat ", name) ;
-    retrydirlater() ;
-    return ;
+    return -4 ;
   }
-  if (!S_ISDIR(st.st_mode)) return ;
-  namelen = strlen(name) ;
-  for (; i < n ; i++) if ((services[i].ino == st.st_ino) && (services[i].dev == st.st_dev)) break ;
-  if (i < n)
+  if (!S_ISDIR(st.st_mode)) return max ;
+  di.dev = st.st_dev ;
+  di.ino = st.st_ino ;
+  if (avltreen_search(by_devino, &di, &i))
   {
-    if (services[i].flaglog && (services[i].p[0] < 0))
+    LOLDEBUG("check: existing service %u", i) ;
+    sv = SERVICE(i) ;
+    if (sv->peer < max)
     {
-     /* See BLACK MAGIC above. */
-      services[i].p[0] = -2 ;
-      return ;
+      if (prod < max && prod != sv->peer)
+      {
+        strerr_warnw3x("old service ", name, " still exists, waiting") ;
+        return -10 ;
+      }
+      if (sv->p == -1)
+      {
+        sv->p = -2 ;
+        return i ;
+      }
     }
   }
   else
   {
-    if (n >= max)
+    i = genset_new(services) ;
+    if (i >= max)
     {
       strerr_warnwu3x("start supervisor for ", name, ": too many services") ;
-      return ;
+      return -60 ;
     }
-    else
+    LOLDEBUG("check: new service %u", i) ;
+    sv = SERVICE(i) ;
+    sv->devino = di ;
+    sv->pid = 0 ;
+    tain_copynow(&sv->start) ;
+    tain_copynow(&start_deadline) ; /* XXX: may cause a superfluous start if logger fails, oh well */
+    if (prod >= max)
     {
-      if (!strcmp(name, SPECIAL_LOGGER_SERVICE))
+      sv->peer = max ;
+      sv->p = -1 ;
+      if (special >= max && !strcmp(name, SPECIAL_LOGGER_SERVICE))
       {
-        services[i].flagspecial = 1 ;
-        services[i].flaglog = 0 ;
+        special = i ;
+        LOLDEBUG("check: %u is special", i) ;
       }
-      else
+    }
+    else
+    {
+      int p[2] ;
+      if (pipecoe(p) == -1)
       {
-        struct stat su ;
-        char tmp[namelen + 5] ;
-        services[i].flagspecial = 0 ;
-        memcpy(tmp, name, namelen) ;
-        memcpy(tmp + namelen, "/log", 5) ;
-        if (stat(tmp, &su) < 0)
-          if (errno == ENOENT) services[i].flaglog = 0 ;
-          else
-          {
-            strerr_warnwu2sys("stat ", tmp) ;
-            retrydirlater() ;
-            return ;
-          }
-        else if (!S_ISDIR(su.st_mode))
-          services[i].flaglog = 0 ;
-        else
-        {
-          if (pipecoe(services[i].p) < 0)
-          {
-            strerr_warnwu1sys("pipecoe") ;
-            retrydirlater() ;
-            return ;
-          }
-          services[i].flaglog = 1 ;
-        }
+        strerr_warnwu2sys("create pipe for ", name) ;
+        genset_delete(services, i) ;
+        return -3 ;
       }
-      services[i].ino = st.st_ino ;
-      services[i].dev = st.st_dev ;
-      tain_copynow(&services[i].restartafter[0]) ;
-      tain_copynow(&services[i].restartafter[1]) ;
-      services[i].pid[0] = 0 ;
-      services[i].pid[1] = 0 ;
-      n++ ;
+      sv->peer = prod ;
+      sv->p = p[0] ;
+      SERVICE(prod)->peer = i ;
+      SERVICE(prod)->p = p[1] ;
+      LOLDEBUG("check: %u paired with %u", i, prod) ;
     }
+    avltreen_insert(by_devino, i) ;
   }
-  
-  services[i].flagactive = 1 ;
+  strcpy(NAME(i), name) ;
+  bitarray_set(act, i) ;
+  return i ;
+}
 
-  if (services[i].flaglog && !services[i].pid[1])
-  {
-    if (!tain_future(&services[i].restartafter[1]))
-    {
-      char tmp[namelen + 5] ;
-      memcpy(tmp, name, namelen) ;
-      memcpy(tmp + namelen, "/log", 5) ;
-      trystart(i, tmp, 1) ;
-    }
-    else if (tain_less(&services[i].restartafter[1], &deadline))
-      deadline = services[i].restartafter[1] ;
-  }
+static void set_scan_timeout (unsigned int n)
+{
+  tain a ;
+  tain_addsec_g(&a, n) ;
+  tain_earliest1(&scan_deadline, &a) ;
+  LOLDEBUG("set_scan_timeout to %u", n) ;
+}
 
-  if (!services[i].pid[0])
+static int remove_deadinactive_iter (void *data, void *aux) /* genset_iter callback: data is a service, aux points to a uint32_t countdown */
+{
+  service *sv = data ;
+  uint32_t *n = aux ; /* number of inactive services the caller still expects to meet */
+  if (!bitarray_peek(active, sv - SERVICE(0)))
   {
-    if (!tain_future(&services[i].restartafter[0]))
-      trystart(i, name, 0) ;
-    else if (tain_less(&services[i].restartafter[0], &deadline))
-      deadline = services[i].restartafter[0] ;
+    LOLDEBUG("scan: %u is inactive", sv - SERVICE(0)) ;
+    if (!sv->pid) remove_service(sv) ; /* no supervisor left for it: release the slot now */
+    if (!--*n) return 0 ; /* decrement the counter itself (not the pointer); returning 0 is meant to end the iteration early */
   }
+  return 1 ;
 }
 
-static inline void scan (void)
+static void scan (void) /* walks ".", registers each service dir and its optional log/ subdir, then commits the new active set */
 {
-  unsigned int i = 0 ;
-  DIR *dir ;
-  if (!wantscan) return ;
-  wantscan = 0 ;
-  tain_add_g(&deadline, &defaulttimeout) ;
-  dir = opendir(".") ;
+  DIR *dir = opendir(".") ;
+  char tmpactive[bitarray_div8(max)] ; /* scratch bitmap: only copied into active when the whole scan succeeds */
+  tain_add_g(&scan_deadline, &scantto) ; /* default next scan: now + scantto; failure paths below may bring it closer */
+  memset(tmpactive, 0, bitarray_div8(max)) ;
+  LOLDEBUG("scan") ;
   if (!dir)
   {
     strerr_warnwu1sys("opendir .") ;
-    retrydirlater() ;
+    set_scan_timeout(5) ;
     return ;
   }
-  for (; i < n ; i++) services[i].flagactive = 0 ;
   for (;;)
   {
+    int i, r ; /* i: result of check() on the service dir; r: result of check() on its log/ subdir */
+    size_t len ;
     direntry *d ;
     errno = 0 ;
     d = readdir(dir) ;
     if (!d) break ;
-    check(d->d_name) ;
+    if (d->d_name[0] == '.') continue ;
+    len = strlen(d->d_name) ;
+    if (len > namemax)
+    {
+      strerr_warnw2x("name too long - not spawning service: ", d->d_name) ;
+      continue ;
+    }
+    i = check(d->d_name, max, tmpactive) ;
+    if (i < 0)
+    {
+      dir_close(dir) ;
+      set_scan_timeout(-i) ; /* a negative return from check() is the retry delay in seconds, negated */
+      return ;
+    }
+    if (i < max)
+    {
+      char logname[len + 5] ;
+      memcpy(logname, d->d_name, len) ;
+      memcpy(logname + len, "/log", 5) ;
+      if ((r = check(logname, i, tmpactive)) < 0)
+      {
+        genset_delete(services, i) ; /* NOTE(review): i may be a pre-existing service, and check() already indexed new ones in by_devino - confirm this delete is safe */
+        dir_close(dir) ;
+        set_scan_timeout(-r) ; /* use the logger's error code; i is the producer's non-negative index here */
+        return ;
+      }
+    }
   }
+  dir_close(dir) ;
   if (errno)
   {
     strerr_warnwu1sys("readdir .") ;
-    retrydirlater() ;
+    set_scan_timeout(5) ;
+    return ;
   }
-  dir_close(dir) ;
-  for (i = 0 ; i < n ; i++) if (!services[i].flagactive && !services[i].pid[0])
+  memcpy(active, tmpactive, bitarray_div8(max)) ;
+
   {
-    if (services[i].flaglog)
+    uint32_t n = genset_n(services) - avltreen_len(by_devino) ; /* NOTE(review): both containers look updated in lockstep, so this may always be 0 - confirm it counts inactive services as intended */
+    if (n) genset_iter(services, &remove_deadinactive_iter, &n) ;
+  }
+  LOLDEBUG("scan: end") ;
+}
+
+
+ /*
+    On-timeout action: starter.
+    This cannot be user-triggered. It runs when a service needs to (re)start.
+ */
+
+static int start_iter (void *data, void *aux) /* genset_iter callback: spawns s6-supervise for one service if it is due */
+{
+  service *sv = data ;
+  uint32_t i = sv - SERVICE(0) ;
+  if (!bitarray_peek(active, i)
+   || sv->pid
+   || tain_future(&sv->start)) return 1 ; /* skip: inactive, already supervised, or not yet allowed to restart */
+  LOLDEBUG("start: spawning %u", i) ;
+  sv->pid = fork() ;
+  switch (sv->pid)
+  {
+    case -1 :
+      sv->pid = 0 ; /* keep "no supervisor" encoded as 0 */
+      strerr_warnwu2sys("fork for ", NAME(i)) ; /* separator restored so the message reads "unable to fork for <name>" */
+      tain_addsec_g(&start_deadline, 10) ; /* try the starter again in 10 seconds */
+      return 0 ;
+    case 0 :
     {
-      if (services[i].pid[1]) continue ;
-      if (services[i].p[0] >= 0)
+      char const *cargv[3] = { "s6-supervise", NAME(i), 0 } ;
+      PROG = "s6-svscan (child)" ;
+      if (sv->peer < max)
+      {
+        if (fd_move(!is_logger(i), sv->p) == -1) /* a logger gets its pipe end on fd 0, a producer on fd 1 */
+          strerr_diefu2sys(111, "dup2 pipe for ", NAME(i)) ;
+      }
+      if (consoleholder && i == special)
       {
-        fd_close(services[i].p[1]) ; services[i].p[1] = -1 ;
-        fd_close(services[i].p[0]) ; services[i].p[0] = -1 ;
+        if (fd_move(2, consoleholder) == -1)
+         strerr_diefu2sys(111, "restore console fd for service ", NAME(i)) ;
       }
+      selfpipe_finish() ;
+      xexec_a(S6_BINPREFIX "s6-supervise", cargv) ;
     }
-    services[i] = services[--n] ;
   }
+  LOLDEBUG("start: by_pid has %u nodes, inserting new pid %llu", avltreen_len(by_pid), (unsigned long long)sv->pid) ;
+  avltreen_insert(by_pid, i) ; /* parent only: index the new supervisor by pid so reap() can find it */
+  tain_addsec_g(&sv->start, 1) ; /* earliest time this service may be started again: one second from now */
+  (void)aux ;
+  return 1 ;
 }
 
+static inline void start (void)
+{
+  start_deadline = tain_infinite ;
+  genset_iter(services, &start_iter, 0) ;
+}
+
+
+ /* Main. */
+
 static inline int control_init (void)
 {
   mode_t m = umask(0) ;
@@ -546,114 +648,145 @@ static inline int control_init (void)
 
 int main (int argc, char const *const *argv)
 {
-  iopause_fd x[2] = { { -1, IOPAUSE_READ, 0 }, { -1, IOPAUSE_READ, 0 } } ;
-  int notif = -1 ;
+  iopause_fd x[2] = { { .fd = -1, .events = IOPAUSE_READ }, { .fd = -1, .events = IOPAUSE_READ } } ;
   PROG = "s6-svscan" ;
+
   {
     subgetopt l = SUBGETOPT_ZERO ;
+    unsigned int notif = 0 ;
     unsigned int t = 0 ;
     for (;;)
     {
-      int opt = subgetopt_r(argc, argv, "c:t:d:X:", &l) ;
+      int opt = subgetopt_r(argc, argv, "c:C:L:t:d:X:", &l) ;
       if (opt == -1) break ;
       switch (opt)
       {
-        case 'c' : if (uint0_scan(l.arg, &max)) break ;
-        case 't' : if (uint0_scan(l.arg, &t)) break ;
-        case 'd' : { unsigned int u ; if (!uint0_scan(l.arg, &u)) dieusage() ; notif = u ; break ; }
-        case 'X' : { unsigned int u ; if (!uint0_scan(l.arg, &u)) dieusage() ; consoleholder = u ; break ; }
+        case 'c' : if (!uint320_scan(l.arg, &max)) dieusage() ; max <<= 1 ; break ;
+        case 'C' : if (!uint320_scan(l.arg, &max)) dieusage() ; break ;
+        case 'L' : if (!uint320_scan(l.arg, &namemax)) dieusage() ; break ;
+        case 't' : if (!uint0_scan(l.arg, &t)) dieusage() ; break ;
+        case 'd' : if (!uint0_scan(l.arg, &notif)) dieusage() ; break ;
+        case 'X' : if (!uint0_scan(l.arg, &consoleholder)) dieusage() ; break ;
         default : dieusage() ;
       }
     }
     argc -= l.ind ; argv += l.ind ;
-    if (t) tain_from_millisecs(&defaulttimeout, t) ;
-    else defaulttimeout = tain_infinite_relative ;
-    if (max < 2) max = 2 ;
-    if (max > 90000) max = 90000 ;
-  }
-
-  if (notif >= 0)
-  {
-    if (notif < 3) strerr_dief1x(100, "notification fd must be 3 or more") ;
-    if (fcntl(notif, F_GETFD) < 0) strerr_dief1sys(100, "invalid notification fd") ;
-  }
-  if (consoleholder >= 0)
-  {
-    if (consoleholder < 3) strerr_dief1x(100, "console holder fd must be 3 or more") ;
-    if (fcntl(consoleholder, F_GETFD) < 0) strerr_dief1sys(100, "invalid console holder fd") ;
-    if (coe(consoleholder) < 0) strerr_diefu1sys(111, "coe console holder") ;
-  }
-  if (!fd_sanitize()) strerr_diefu1x(100, "sanitize standard fds") ;
+    if (t) tain_from_millisecs(&scantto, t) ;
+    if (max < 4) max = 4 ;
+    if (max > 160000) max = 160000 ;
+    special = max ;
+    if (namemax < 11) namemax = 11 ;
+    if (namemax > 1019) namemax = 1019 ;
+
+    if (notif)
+    {
+      if (notif < 3) strerr_dief1x(100, "notification fd must be 3 or more") ;
+      if (fcntl(notif, F_GETFD) == -1) strerr_dief1sys(100, "invalid notification fd") ;
+    }
+    if (consoleholder)
+    {
+      if (consoleholder < 3) strerr_dief1x(100, "console holder fd must be 3 or more") ;
+      if (fcntl(consoleholder, F_GETFD) < 0) strerr_dief1sys(100, "invalid console holder fd") ;
+      if (coe(consoleholder) == -1) strerr_diefu1sys(111, "coe console holder") ;
+    }
+    if (!fd_sanitize()) strerr_diefu1x(100, "sanitize standard fds") ;
 
-  if (argc && (chdir(argv[0]) < 0)) strerr_diefu1sys(111, "chdir") ;
-  x[1].fd = control_init() ;
-  x[0].fd = selfpipe_init() ;
-  if (x[0].fd < 0) strerr_diefu1sys(111, "selfpipe_init") ;
+    if (argc && (chdir(argv[0]) == -1)) strerr_diefu1sys(111, "chdir") ;
+    x[1].fd = control_init() ;
+    x[0].fd = selfpipe_init() ;
+    if (x[0].fd < 0) strerr_diefu1sys(111, "selfpipe_init") ;
 
-  if (!sig_altignore(SIGPIPE)) strerr_diefu1sys(111, "ignore SIGPIPE") ;
-  {
-    sigset_t set ;
-    sigemptyset(&set) ;
-    sigaddset(&set, SIGCHLD) ;
-    sigaddset(&set, SIGALRM) ;
-    sigaddset(&set, SIGABRT) ;
-    sigaddset(&set, SIGHUP) ;
-    sigaddset(&set, SIGINT) ;
-    sigaddset(&set, SIGTERM) ;
-    sigaddset(&set, SIGQUIT) ;
-    sigaddset(&set, SIGUSR1) ;
-    sigaddset(&set, SIGUSR2) ;
+    if (!sig_altignore(SIGPIPE)) strerr_diefu1sys(111, "ignore SIGPIPE") ;
+    {
+      sigset_t set ;
+      sigemptyset(&set) ;
+      sigaddset(&set, SIGCHLD) ;
+      sigaddset(&set, SIGALRM) ;
+      sigaddset(&set, SIGABRT) ;
+      sigaddset(&set, SIGHUP) ;
+      sigaddset(&set, SIGINT) ;
+      sigaddset(&set, SIGTERM) ;
+      sigaddset(&set, SIGQUIT) ;
+      sigaddset(&set, SIGUSR1) ;
+      sigaddset(&set, SIGUSR2) ;
 #ifdef SIGPWR
-    sigaddset(&set, SIGPWR) ;
+      sigaddset(&set, SIGPWR) ;
 #endif
 #ifdef SIGWINCH
-    sigaddset(&set, SIGWINCH) ;
+      sigaddset(&set, SIGWINCH) ;
 #endif
-    if (!selfpipe_trapset(&set)) strerr_diefu1sys(111, "trap signals") ;
-  }
-  if (notif >= 0)
-  {
-    fd_write(notif, "\n", 1) ;
-    fd_close(notif) ;
-    notif = -1 ;
+      if (!selfpipe_trapset(&set)) strerr_diefu1sys(111, "trap signals") ;
+    }
+    if (notif)
+    {
+      write(notif, "\n", 1) ;
+      close(notif) ;
+    }
   }
 
   {
-    struct svinfo_s blob[max] ; /* careful with that stack, Eugene */
-    services = blob ;
+    service services_storage[max] ;
+    uint32_t services_freelist[max] ;
+    avlnode bydevino_storage[max] ;
+    uint32_t bydevino_freelist[max] ;
+    avlnode bypid_storage[max] ;
+    uint32_t bypid_freelist[max] ;
+    genset services_info ;
+    avltreen bydevino_info ;
+    avltreen bypid_info ;
+    char name_storage[max * (namemax + 5)] ;
+    char active_storage[bitarray_div8(max)] ;
+
+    GENSET_init(&services_info, service, services_storage, services_freelist, max) ;
+    avltreen_init(&bydevino_info, bydevino_storage, bydevino_freelist, max, &bydevino_dtok, &bydevino_cmp, &services_info) ;
+    avltreen_init(&bypid_info, bypid_storage, bypid_freelist, max, &bypid_dtok, &bypid_cmp, &services_info) ;
+    services = &services_info ;
+    by_devino = &bydevino_info ;
+    by_pid = &bypid_info ;
+    names = name_storage ;
+    active = active_storage ;
+
     tain_now_set_stopwatch_g() ;
 
     /* From now on, we must not die.
        Temporize on recoverable errors, and panic on serious ones. */
 
-    while (cont)
+    while (flags.cont)
     {
       int r ;
-      reap() ;
-      scan() ;
-      killthem() ;
+      tain deadline = scan_deadline ;
+      LOLDEBUG("loop") ;
+      tain_earliest1(&deadline, &start_deadline) ;
       r = iopause_g(x, 2, &deadline) ;
       if (r < 0) panic("iopause") ;
-      else if (!r) wantscan = 1 ;
+      else if (!r)
+      {
+        LOLDEBUG("loop: timeout") ;
+        if (!tain_future(&scan_deadline)) scan() ;
+        if (!tain_future(&start_deadline)) start() ;
+      }
       else
       {
+        unsigned int what = 0 ;
+        LOLDEBUG("loop: event") ;
         if ((x[0].revents | x[1].revents) & IOPAUSE_EXCEPT)
         {
           errno = EIO ;
           panic("check internal pipes") ;
         }
-        if (x[0].revents & IOPAUSE_READ) handle_signals() ;
-        if (x[1].revents & IOPAUSE_READ) handle_control(x[1].fd) ;
+        if (x[0].revents & IOPAUSE_READ) handle_signals(&what) ;
+        if (x[1].revents & IOPAUSE_READ) handle_control(x[1].fd, &what) ;
+        if (what & 7) killthem(what & 7) ;
+        if (what & 8) reap() ;
       }
     }
-
+    LOLDEBUG("exiting loop") ;
 
     /* Finish phase. */
 
-    killthem() ;
-    closethem() ;
+    close_pipes() ;
     restore_console() ;
-    if (waitall) waitthem() ; else { chld() ; reap() ; }
+    if (flags.waitall) waitthem() ;
     selfpipe_finish() ;
   }
   {