about summary refs log tree commit diff
path: root/sysdeps
diff options
context:
space:
mode:
authorH.J. Lu <hjl.tools@gmail.com>2021-02-13 11:47:46 -0800
committerH.J. Lu <hjl.tools@gmail.com>2021-07-14 06:33:58 -0700
commitd8ea0d0168b190bdf138a20358293c939509367f (patch)
tree09f489ad57652978fe9c8dbf8f649d6868bafbda /sysdeps
parent135425a1dd50cbe2b9db0628d6c2b36c7889f30b (diff)
downloadglibc-d8ea0d0168b190bdf138a20358293c939509367f.tar.gz
glibc-d8ea0d0168b190bdf138a20358293c939509367f.tar.xz
glibc-d8ea0d0168b190bdf138a20358293c939509367f.zip
Add an internal wrapper for clone, clone2 and clone3
The clone3 system call (since Linux 5.3) provides a superset of the
functionality of clone and clone2.  It also provides a number of API
improvements, including the ability to specify the size of the child's
stack area which can be used by kernel to compute the shadow stack size
when allocating the shadow stack.  Add:

extern int __clone_internal (struct clone_args *__cl_args,
			     int (*__func) (void *__arg), void *__arg);

to provide an abstract interface for clone, clone2 and clone3.

1. Simplify stack management for thread creation by passing both stack
base and size to create_thread.
2. Consolidate clone vs clone2 differences into a single file.
3. Call __clone3 if HAVE_CLONE3_WAPPER is defined.  If __clone3 returns
-1 with ENOSYS, fall back to clone or clone2.
4. Use only __clone_internal to clone a thread.  Since the stack size
argument for create_thread is now unconditional, always pass stack size
to create_thread.
5. Enable the public clone3 wrapper in the future after it has been
added to all targets.

NB: Sandbox will return ENOSYS on clone3 in both Chromium:

The following revision refers to this bug:
  https://chromium.googlesource.com/chromium/src/+/218438259dd795456f0a48f67cbe5b4e520db88b

commit 218438259dd795456f0a48f67cbe5b4e520db88b
Author: Matthew Denton <mpdenton@chromium.org>
Date: Thu Jun 03 20:06:13 2021

Linux sandbox: return ENOSYS for clone3

Because clone3 uses a pointer argument rather than a flags argument, we
cannot examine the contents with seccomp, which is essential to
preventing sandboxed processes from starting other processes. So, we
won't be able to support clone3 in Chromium. This CL modifies the
BPF policy to return ENOSYS for clone3 so glibc always uses the fallback
to clone.

Bug: 1213452
Change-Id: I7c7c585a319e0264eac5b1ebee1a45be2d782303
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2936184
Reviewed-by: Robert Sesek <rsesek@chromium.org>
Commit-Queue: Matthew Denton <mpdenton@chromium.org>
Cr-Commit-Position: refs/heads/master@{#888980}

[modify] https://crrev.com/218438259dd795456f0a48f67cbe5b4e520db88b/sandbox/linux/seccomp-bpf-helpers/baseline_policy.cc

and Firefox:

https://hg.mozilla.org/integration/autoland/rev/ecb4011a0c76

Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Diffstat (limited to 'sysdeps')
-rw-r--r--sysdeps/unix/sysv/linux/Makefile3
-rw-r--r--sysdeps/unix/sysv/linux/clone-internal.c91
-rw-r--r--sysdeps/unix/sysv/linux/clone3.c1
-rw-r--r--sysdeps/unix/sysv/linux/clone3.h67
-rw-r--r--sysdeps/unix/sysv/linux/spawni.c26
5 files changed, 170 insertions, 18 deletions
diff --git a/sysdeps/unix/sysv/linux/Makefile b/sysdeps/unix/sysv/linux/Makefile
index feb8fd4ce1..ed0c0d27f4 100644
--- a/sysdeps/unix/sysv/linux/Makefile
+++ b/sysdeps/unix/sysv/linux/Makefile
@@ -65,7 +65,8 @@ sysdep_routines += adjtimex clone umount umount2 readahead sysctl \
 		   xstat fxstat lxstat xstat64 fxstat64 lxstat64 \
 		   fxstatat fxstatat64 \
 		   xmknod xmknodat convert_scm_timestamps \
-		   closefrom_fallback
+		   closefrom_fallback \
+		   clone3 clone-internal
 
 CFLAGS-gethostid.c = -fexceptions
 CFLAGS-tee.c = -fexceptions -fasynchronous-unwind-tables
diff --git a/sysdeps/unix/sysv/linux/clone-internal.c b/sysdeps/unix/sysv/linux/clone-internal.c
new file mode 100644
index 0000000000..1e7a8f6b35
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/clone-internal.c
@@ -0,0 +1,91 @@
+/* The internal wrapper of clone and clone3.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <stddef.h>
+#include <errno.h>
+#include <sched.h>
+#include <clone_internal.h>
+#include <libc-pointer-arith.h>	/* For cast_to_pointer.  */
+#include <stackinfo.h>		/* For _STACK_GROWS_{UP,DOWN}.  */
+
+#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
+#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */
+#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */
+
+#define sizeof_field(TYPE, MEMBER) sizeof ((((TYPE *)0)->MEMBER))
+#define offsetofend(TYPE, MEMBER) \
+  (offsetof (TYPE, MEMBER) + sizeof_field (TYPE, MEMBER))
+
+_Static_assert (__alignof (struct clone_args) == 8,
+		"__alignof (struct clone_args) != 8");
+_Static_assert (offsetofend (struct clone_args, tls) == CLONE_ARGS_SIZE_VER0,
+		"offsetofend (struct clone_args, tls) != CLONE_ARGS_SIZE_VER0");
+_Static_assert (offsetofend (struct clone_args, set_tid_size) == CLONE_ARGS_SIZE_VER1,
+		"offsetofend (struct clone_args, set_tid_size) != CLONE_ARGS_SIZE_VER1");
+_Static_assert (offsetofend (struct clone_args, cgroup) == CLONE_ARGS_SIZE_VER2,
+		"offsetofend (struct clone_args, cgroup) != CLONE_ARGS_SIZE_VER2");
+_Static_assert (sizeof (struct clone_args) == CLONE_ARGS_SIZE_VER2,
+		"sizeof (struct clone_args) != CLONE_ARGS_SIZE_VER2");
+
+int
+__clone_internal (struct clone_args *cl_args,
+		  int (*func) (void *arg), void *arg)
+{
+  int ret;
+#ifdef HAVE_CLONE3_WAPPER
+  /* Try clone3 first.  */
+  int saved_errno = errno;
+  ret = __clone3 (cl_args, sizeof (*cl_args), func, arg);
+  if (ret != -1 || errno != ENOSYS)
+    return ret;
+
+  /* NB: Restore errno since errno may be checked against non-zero
+     return value.  */
+  __set_errno (saved_errno);
+#endif
+
+  /* Map clone3 arguments to clone arguments.  NB: No need to check
+     invalid clone3 specific bits in flags nor exit_signal since this
+     is an internal function.  */
+  int flags = cl_args->flags | cl_args->exit_signal;
+  void *stack = cast_to_pointer (cl_args->stack);
+
+#ifdef __ia64__
+  ret = __clone2 (func, stack, cl_args->stack_size,
+		  flags, arg,
+		  cast_to_pointer (cl_args->parent_tid),
+		  cast_to_pointer (cl_args->tls),
+		  cast_to_pointer (cl_args->child_tid));
+#else
+# if !_STACK_GROWS_DOWN && !_STACK_GROWS_UP
+#  error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
+# endif
+
+# if _STACK_GROWS_DOWN
+  stack += cl_args->stack_size;
+# endif
+  ret = __clone (func, stack, flags, arg,
+		 cast_to_pointer (cl_args->parent_tid),
+		 cast_to_pointer (cl_args->tls),
+		 cast_to_pointer (cl_args->child_tid));
+#endif
+  return ret;
+}
+
+libc_hidden_def (__clone_internal)
diff --git a/sysdeps/unix/sysv/linux/clone3.c b/sysdeps/unix/sysv/linux/clone3.c
new file mode 100644
index 0000000000..de963ef89d
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/clone3.c
@@ -0,0 +1 @@
+/* An empty placeholder.  */
diff --git a/sysdeps/unix/sysv/linux/clone3.h b/sysdeps/unix/sysv/linux/clone3.h
new file mode 100644
index 0000000000..1e35ff6422
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/clone3.h
@@ -0,0 +1,67 @@
+/* The wrapper of clone3.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _CLONE3_H
+#define _CLONE3_H	1
+
+#include <features.h>
+#include <stddef.h>
+#include <bits/types.h>
+
+__BEGIN_DECLS
+
+/* The unsigned 64-bit and 8-byte aligned integer type.  */
+typedef __U64_TYPE __aligned_uint64_t __attribute__ ((__aligned__ (8)));
+
+/* This struct should only be used in an argument to the clone3 system
+   call (along with its size argument).  It may be extended with new
+   fields in the future.  */
+
+struct clone_args
+{
+  /* Flags bit mask.  */
+  __aligned_uint64_t flags;
+  /* Where to store PID file descriptor (pid_t *).  */
+  __aligned_uint64_t pidfd;
+  /* Where to store child TID, in child's memory (pid_t *).  */
+  __aligned_uint64_t child_tid;
+  /* Where to store child TID, in parent's memory (int *). */
+  __aligned_uint64_t parent_tid;
+  /* Signal to deliver to parent on child termination */
+  __aligned_uint64_t exit_signal;
+  /* The lowest address of stack.  */
+  __aligned_uint64_t stack;
+  /* Size of stack.  */
+  __aligned_uint64_t stack_size;
+  /* Location of new TLS.  */
+  __aligned_uint64_t tls;
+  /* Pointer to a pid_t array (since Linux 5.5).  */
+  __aligned_uint64_t set_tid;
+  /* Number of elements in set_tid (since Linux 5.5). */
+  __aligned_uint64_t set_tid_size;
+  /* File descriptor for target cgroup of child (since Linux 5.7).  */
+  __aligned_uint64_t cgroup;
+};
+
+/* The wrapper of clone3.  */
+extern int clone3 (struct clone_args *__cl_args, size_t __size,
+		   int (*__func) (void *__arg), void *__arg);
+
+__END_DECLS
+
+#endif /* clone3.h */
diff --git a/sysdeps/unix/sysv/linux/spawni.c b/sysdeps/unix/sysv/linux/spawni.c
index f7e7353a05..6b0bade4d4 100644
--- a/sysdeps/unix/sysv/linux/spawni.c
+++ b/sysdeps/unix/sysv/linux/spawni.c
@@ -26,6 +26,7 @@
 #include <spawn_int.h>
 #include <sysdep.h>
 #include <sys/resource.h>
+#include <clone_internal.h>
 
 /* The Linux implementation of posix_spawn{p} uses the clone syscall directly
    with CLONE_VM and CLONE_VFORK flags and an allocated stack.  The new stack
@@ -53,21 +54,6 @@
    normal program exit with the exit code 127.  */
 #define SPAWN_ERROR	127
 
-#ifdef __ia64__
-# define CLONE(__fn, __stackbase, __stacksize, __flags, __args) \
-  __clone2 (__fn, __stackbase, __stacksize, __flags, __args, 0, 0, 0)
-#else
-# define CLONE(__fn, __stack, __stacksize, __flags, __args) \
-  __clone (__fn, __stack, __flags, __args)
-#endif
-
-/* Since ia64 wants the stackbase w/clone2, re-use the grows-up macro.  */
-#if _STACK_GROWS_UP || defined (__ia64__)
-# define STACK(__stack, __stack_size) (__stack)
-#elif _STACK_GROWS_DOWN
-# define STACK(__stack, __stack_size) (__stack + __stack_size)
-#endif
-
 
 struct posix_spawn_args
 {
@@ -382,8 +368,14 @@ __spawnix (pid_t * pid, const char *file,
      need for CLONE_SETTLS.  Although parent and child share the same TLS
      namespace, there will be no concurrent access for TLS variables (errno
      for instance).  */
-  new_pid = CLONE (__spawni_child, STACK (stack, stack_size), stack_size,
-		   CLONE_VM | CLONE_VFORK | SIGCHLD, &args);
+  struct clone_args clone_args =
+    {
+      .flags = CLONE_VM | CLONE_VFORK,
+      .exit_signal = SIGCHLD,
+      .stack = (uintptr_t) stack,
+      .stack_size = stack_size,
+    };
+  new_pid = __clone_internal (&clone_args, __spawni_child, &args);
 
   /* It needs to collect the case where the auxiliary process was created
      but failed to execute the file (due either any preparation step or