From 9b7091415af47082664717210ac49d51551456ab Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Wed, 28 Feb 2024 12:08:03 -0800 Subject: x86-64: Update _dl_tlsdesc_dynamic to preserve AMX registers _dl_tlsdesc_dynamic should also preserve AMX registers which are caller-saved. Add X86_XSTATE_TILECFG_ID and X86_XSTATE_TILEDATA_ID to x86-64 TLSDESC_CALL_STATE_SAVE_MASK. Compute the AMX state size and save it in xsave_state_full_size which is only used by _dl_tlsdesc_dynamic_xsave and _dl_tlsdesc_dynamic_xsavec. This fixes the AMX part of BZ #31372. Tested on AMX processor. AMX test is enabled only for compilers with the fix for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114098 GCC 14 and GCC 11/12/13 branches have the bug fix. Reviewed-by: Sunil K Pandey --- sysdeps/unix/sysv/linux/x86_64/Makefile | 27 +++++++ sysdeps/unix/sysv/linux/x86_64/include/asm/prctl.h | 5 ++ .../sysv/linux/x86_64/tst-gnu2-tls2-amx-mod0.c | 2 + .../sysv/linux/x86_64/tst-gnu2-tls2-amx-mod1.c | 2 + .../sysv/linux/x86_64/tst-gnu2-tls2-amx-mod2.c | 2 + sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.c | 83 ++++++++++++++++++++++ sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.h | 63 ++++++++++++++++ sysdeps/x86/cpu-features-offsets.sym | 1 + sysdeps/x86/cpu-features.c | 55 +++++++++++++- sysdeps/x86/include/cpu-features.h | 2 + sysdeps/x86/sysdep.h | 18 ++++- sysdeps/x86_64/configure | 28 ++++++++ sysdeps/x86_64/configure.ac | 15 ++++ sysdeps/x86_64/dl-tlsdesc-dynamic.h | 2 +- 14 files changed, 299 insertions(+), 6 deletions(-) create mode 100644 sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod0.c create mode 100644 sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod1.c create mode 100644 sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod2.c create mode 100644 sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.c create mode 100644 sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.h diff --git a/sysdeps/unix/sysv/linux/x86_64/Makefile b/sysdeps/unix/sysv/linux/x86_64/Makefile index 7d1d205fa0..fcbffd81cb 100644 --- a/sysdeps/unix/sysv/linux/x86_64/Makefile +++ b/sysdeps/unix/sysv/linux/x86_64/Makefile @@ -66,6 +66,33 @@ $(objpfx)libx86-64-isa-level%.os: $(..)/sysdeps/unix/sysv/linux/x86_64/x86-64-is $(objpfx)libx86-64-isa-level.so: $(objpfx)libx86-64-isa-level-1.so cp $< $@ endif + +ifeq (yes,$(have-mamx-tile)) +tests += \ + tst-gnu2-tls2-amx \ +# tests + +modules-names += \ + tst-gnu2-tls2-amx-mod0 \ + tst-gnu2-tls2-amx-mod1 \ + tst-gnu2-tls2-amx-mod2 \ +# modules-names + +$(objpfx)tst-gnu2-tls2-amx: $(shared-thread-library) +$(objpfx)tst-gnu2-tls2-amx.out: \ + $(objpfx)tst-gnu2-tls2-amx-mod0.so \ + $(objpfx)tst-gnu2-tls2-amx-mod1.so \ + $(objpfx)tst-gnu2-tls2-amx-mod2.so +$(objpfx)tst-gnu2-tls2-amx-mod0.so: $(libsupport) +$(objpfx)tst-gnu2-tls2-amx-mod1.so: $(libsupport) +$(objpfx)tst-gnu2-tls2-amx-mod2.so: $(libsupport) + +CFLAGS-tst-gnu2-tls2-amx.c += -mamx-tile +CFLAGS-tst-gnu2-tls2-amx-mod0.c += -mamx-tile -mtls-dialect=gnu2 +CFLAGS-tst-gnu2-tls2-amx-mod1.c += -mamx-tile -mtls-dialect=gnu2 +CFLAGS-tst-gnu2-tls2-amx-mod2.c += -mamx-tile -mtls-dialect=gnu2 +endif + endif # $(subdir) == elf ifneq ($(enable-cet),no) diff --git a/sysdeps/unix/sysv/linux/x86_64/include/asm/prctl.h b/sysdeps/unix/sysv/linux/x86_64/include/asm/prctl.h index 2f511321ad..ef4631bf4b 100644 --- a/sysdeps/unix/sysv/linux/x86_64/include/asm/prctl.h +++ b/sysdeps/unix/sysv/linux/x86_64/include/asm/prctl.h @@ -20,3 +20,8 @@ # define ARCH_SHSTK_SHSTK 0x1 # define ARCH_SHSTK_WRSS 0x2 #endif + +#ifndef ARCH_GET_XCOMP_PERM +# define ARCH_GET_XCOMP_PERM 0x1022 +# define ARCH_REQ_XCOMP_PERM 0x1023 +#endif diff --git a/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod0.c b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod0.c new file mode 100644 index 0000000000..2e0c7b91b7 --- /dev/null +++ b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod0.c @@ -0,0 +1,2 @@ +#include "tst-gnu2-tls2-amx.h" +#include diff --git a/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod1.c b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod1.c new file mode 100644 index 0000000000..b8a8ccf1c1 --- /dev/null +++ b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod1.c @@ -0,0 +1,2 @@ +#include "tst-gnu2-tls2-amx.h" +#include diff --git a/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod2.c b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod2.c new file mode 100644 index 0000000000..cdf4a8f363 --- /dev/null +++ b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod2.c @@ -0,0 +1,2 @@ +#include "tst-gnu2-tls2-amx.h" +#include diff --git a/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.c b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.c new file mode 100644 index 0000000000..ae4dd82556 --- /dev/null +++ b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.c @@ -0,0 +1,83 @@ +/* Test TLSDESC relocation with AMX. + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include +#include "tst-gnu2-tls2-amx.h" + +extern int arch_prctl (int, ...); + +#define X86_XSTATE_TILECFG_ID 17 +#define X86_XSTATE_TILEDATA_ID 18 + +/* Initialize tile config. */ +__attribute__ ((noinline, noclone)) +static void +init_tile_config (__tilecfg *tileinfo) +{ + int i; + tileinfo->palette_id = 1; + tileinfo->start_row = 0; + + tileinfo->colsb[0] = MAX_ROWS; + tileinfo->rows[0] = MAX_ROWS; + + for (i = 1; i < 4; ++i) + { + tileinfo->colsb[i] = MAX_COLS; + tileinfo->rows[i] = MAX_ROWS; + } + + _tile_loadconfig (tileinfo); +} + +static bool +enable_amx (void) +{ + uint64_t bitmask; + if (arch_prctl (ARCH_GET_XCOMP_PERM, &bitmask) != 0) + return false; + + if ((bitmask & (1 << X86_XSTATE_TILECFG_ID)) == 0) + return false; + + if (arch_prctl (ARCH_REQ_XCOMP_PERM, X86_XSTATE_TILEDATA_ID) != 0) + return false; + + /* Load tile configuration. */ + __tilecfg tile_data = { 0 }; + init_tile_config (&tile_data); + + return true; +} + +/* An architecture can define it to clobber caller-saved registers in + malloc below to verify that the implicit TLSDESC call won't change + caller-saved registers. */ +static void +clear_tile_register (void) +{ + _tile_zero (2); +} + +#define MOD(i) "tst-gnu2-tls2-amx-mod" #i ".so" +#define IS_SUPPORTED() enable_amx () +#define PREPARE_MALLOC() clear_tile_register () + +#include diff --git a/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.h b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.h new file mode 100644 index 0000000000..1845a3caba --- /dev/null +++ b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.h @@ -0,0 +1,63 @@ +/* Test TLSDESC relocation with AMX. + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include +#include + +#define MAX_ROWS 16 +#define MAX_COLS 64 +#define MAX 1024 +#define STRIDE 64 + +typedef struct __tile_config +{ + uint8_t palette_id; + uint8_t start_row; + uint8_t reserved_0[14]; + uint16_t colsb[16]; + uint8_t rows[16]; +} __tilecfg __attribute__ ((aligned (64))); + +/* Initialize int8_t buffer */ +static inline void +init_buffer (int8_t *buf, int8_t value) +{ + int rows, colsb, i, j; + rows = MAX_ROWS; + colsb = MAX_COLS; + + for (i = 0; i < rows; i++) + for (j = 0; j < colsb; j++) + buf[i * colsb + j] = value; +} + +#define BEFORE_TLSDESC_CALL() \ + int8_t src[MAX]; \ + int8_t res[MAX]; \ + /* Initialize src with data */ \ + init_buffer (src, 2); \ + /* Load tile rows from memory. */ \ + _tile_loadd (2, src, STRIDE); + +#define AFTER_TLSDESC_CALL() \ + /* Store the tile data to memory. */ \ + _tile_stored (2, res, STRIDE); \ + _tile_release (); \ + TEST_VERIFY_EXIT (memcmp (src, res, sizeof (res)) == 0); diff --git a/sysdeps/x86/cpu-features-offsets.sym b/sysdeps/x86/cpu-features-offsets.sym index 6a8fd29813..21fc88d651 100644 --- a/sysdeps/x86/cpu-features-offsets.sym +++ b/sysdeps/x86/cpu-features-offsets.sym @@ -3,3 +3,4 @@ #include XSAVE_STATE_SIZE_OFFSET offsetof (struct cpu_features, xsave_state_size) +XSAVE_STATE_FULL_SIZE_OFFSET offsetof (struct cpu_features, xsave_state_full_size) diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c index 0ad0a78f67..e7c7ece462 100644 --- a/sysdeps/x86/cpu-features.c +++ b/sysdeps/x86/cpu-features.c @@ -308,6 +308,8 @@ update_active (struct cpu_features *cpu_features) __cpuid_count (0xd, 0, eax, ebx, ecx, edx); if (ebx != 0) { + /* NB: On AMX capable processors, ebx always includes AMX + states. */ unsigned int xsave_state_full_size = ALIGN_UP (ebx + STATE_SAVE_OFFSET, 64); @@ -321,6 +323,11 @@ update_active (struct cpu_features *cpu_features) { unsigned int xstate_comp_offsets[32]; unsigned int xstate_comp_sizes[32]; +#ifdef __x86_64__ + unsigned int xstate_amx_comp_offsets[32]; + unsigned int xstate_amx_comp_sizes[32]; + unsigned int amx_ecx; +#endif unsigned int i; xstate_comp_offsets[0] = 0; @@ -328,16 +335,39 @@ update_active (struct cpu_features *cpu_features) xstate_comp_offsets[2] = 576; xstate_comp_sizes[0] = 160; xstate_comp_sizes[1] = 256; +#ifdef __x86_64__ + xstate_amx_comp_offsets[0] = 0; + xstate_amx_comp_offsets[1] = 160; + xstate_amx_comp_offsets[2] = 576; + xstate_amx_comp_sizes[0] = 160; + xstate_amx_comp_sizes[1] = 256; +#endif for (i = 2; i < 32; i++) { - if ((STATE_SAVE_MASK & (1 << i)) != 0) + if ((FULL_STATE_SAVE_MASK & (1 << i)) != 0) { __cpuid_count (0xd, i, eax, ebx, ecx, edx); - xstate_comp_sizes[i] = eax; +#ifdef __x86_64__ + /* Include this in xsave_state_full_size. */ + amx_ecx = ecx; + xstate_amx_comp_sizes[i] = eax; + if ((AMX_STATE_SAVE_MASK & (1 << i)) != 0) + { + /* Exclude this from xsave_state_size. */ + ecx = 0; + xstate_comp_sizes[i] = 0; + } + else +#endif + xstate_comp_sizes[i] = eax; } else { +#ifdef __x86_64__ + amx_ecx = 0; + xstate_amx_comp_sizes[i] = 0; +#endif ecx = 0; xstate_comp_sizes[i] = 0; } @@ -350,6 +380,15 @@ update_active (struct cpu_features *cpu_features) if ((ecx & (1 << 1)) != 0) xstate_comp_offsets[i] = ALIGN_UP (xstate_comp_offsets[i], 64); +#ifdef __x86_64__ + xstate_amx_comp_offsets[i] + = (xstate_amx_comp_offsets[i - 1] + + xstate_amx_comp_sizes[i - 1]); + if ((amx_ecx & (1 << 1)) != 0) + xstate_amx_comp_offsets[i] + = ALIGN_UP (xstate_amx_comp_offsets[i], + 64); +#endif } } @@ -358,6 +397,18 @@ update_active (struct cpu_features *cpu_features) = xstate_comp_offsets[31] + xstate_comp_sizes[31]; if (size) { +#ifdef __x86_64__ + unsigned int amx_size + = (xstate_amx_comp_offsets[31] + + xstate_amx_comp_sizes[31]); + amx_size = ALIGN_UP (amx_size + STATE_SAVE_OFFSET, + 64); + /* Set xsave_state_full_size to the compact AMX + state size for XSAVEC. NB: xsave_state_full_size + is only used in _dl_tlsdesc_dynamic_xsave and + _dl_tlsdesc_dynamic_xsavec. */ + cpu_features->xsave_state_full_size = amx_size; +#endif cpu_features->xsave_state_size = ALIGN_UP (size + STATE_SAVE_OFFSET, 64); CPU_FEATURE_SET (cpu_features, XSAVEC); diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h index b9bf3115b6..cd7bd27cf3 100644 --- a/sysdeps/x86/include/cpu-features.h +++ b/sysdeps/x86/include/cpu-features.h @@ -934,6 +934,8 @@ struct cpu_features /* The full state size for XSAVE when XSAVEC is disabled by GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVEC + + and the AMX state size when XSAVEC is available. */ unsigned int xsave_state_full_size; /* Data cache size for use in memory and string routines, typically diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h index 485cad9c02..db8e576e91 100644 --- a/sysdeps/x86/sysdep.h +++ b/sysdeps/x86/sysdep.h @@ -56,6 +56,14 @@ | (1 << X86_XSTATE_ZMM_H_ID) \ | (1 << X86_XSTATE_ZMM_ID) \ | (1 << X86_XSTATE_APX_F_ID)) + +/* AMX state mask. */ +# define AMX_STATE_SAVE_MASK \ + ((1 << X86_XSTATE_TILECFG_ID) | (1 << X86_XSTATE_TILEDATA_ID)) + +/* States to be included in xsave_state_full_size. */ +# define FULL_STATE_SAVE_MASK \ + (STATE_SAVE_MASK | AMX_STATE_SAVE_MASK) #else /* Offset for fxsave/xsave area used by _dl_tlsdesc_dynamic. Since i386 doesn't have red-zone, use 0 here. */ @@ -68,13 +76,17 @@ | (1 << X86_XSTATE_BNDREGS_ID) \ | (1 << X86_XSTATE_K_ID) \ | (1 << X86_XSTATE_ZMM_H_ID)) + +/* States to be included in xsave_state_size. */ +# define FULL_STATE_SAVE_MASK STATE_SAVE_MASK #endif /* States which should be saved for TLSDESC_CALL and TLS_DESC_CALL. - Compiler assumes that all registers, including x87 FPU stack registers, - are unchanged after CALL, except for EFLAGS and RAX/EAX. */ + Compiler assumes that all registers, including AMX and x87 FPU + stack registers, are unchanged after CALL, except for EFLAGS and + RAX/EAX. */ #define TLSDESC_CALL_STATE_SAVE_MASK \ - (STATE_SAVE_MASK | (1 << X86_XSTATE_X87_ID)) + (FULL_STATE_SAVE_MASK | (1 << X86_XSTATE_X87_ID)) /* Constants for bits in __x86_string_control: */ diff --git a/sysdeps/x86_64/configure b/sysdeps/x86_64/configure index 418cc4a9b8..04a534fa12 100755 --- a/sysdeps/x86_64/configure +++ b/sysdeps/x86_64/configure @@ -134,6 +134,34 @@ fi config_vars="$config_vars enable-cet = $enable_cet" +# Check if -mamx-tile works properly. +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether -mamx-tile works properly" >&5 +printf %s "checking whether -mamx-tile works properly... " >&6; } +if test ${libc_cv_x86_have_amx_tile+y} +then : + printf %s "(cached) " >&6 +else $as_nop + cat > conftest.c < +EOF + libc_cv_x86_have_amx_tile=no + if { ac_try='${CC-cc} -E $CFLAGS -mamx-tile conftest.c > conftest.i' + { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 + (eval $ac_try) 2>&5 + ac_status=$? + printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; }; then + if grep -q __builtin_ia32_ldtilecfg conftest.i; then + libc_cv_x86_have_amx_tile=yes + fi + fi + rm -rf conftest* +fi +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $libc_cv_x86_have_amx_tile" >&5 +printf "%s\n" "$libc_cv_x86_have_amx_tile" >&6; } +config_vars="$config_vars +have-mamx-tile = $libc_cv_x86_have_amx_tile" + test -n "$critic_missing" && as_fn_error $? " *** $critic_missing" "$LINENO" 5 diff --git a/sysdeps/x86_64/configure.ac b/sysdeps/x86_64/configure.ac index d1f803c02e..c714c47351 100644 --- a/sysdeps/x86_64/configure.ac +++ b/sysdeps/x86_64/configure.ac @@ -61,5 +61,20 @@ elif test $enable_cet = permissive; then fi LIBC_CONFIG_VAR([enable-cet], [$enable_cet]) +# Check if -mamx-tile works properly. +AC_CACHE_CHECK(whether -mamx-tile works properly, + libc_cv_x86_have_amx_tile, [dnl +cat > conftest.c < +EOF + libc_cv_x86_have_amx_tile=no + if AC_TRY_COMMAND(${CC-cc} -E $CFLAGS -mamx-tile conftest.c > conftest.i); then + if grep -q __builtin_ia32_ldtilecfg conftest.i; then + libc_cv_x86_have_amx_tile=yes + fi + fi + rm -rf conftest*]) +LIBC_CONFIG_VAR([have-mamx-tile], [$libc_cv_x86_have_amx_tile]) + test -n "$critic_missing" && AC_MSG_ERROR([ *** $critic_missing]) diff --git a/sysdeps/x86_64/dl-tlsdesc-dynamic.h b/sysdeps/x86_64/dl-tlsdesc-dynamic.h index 0c2e8d5320..9f02cfc3eb 100644 --- a/sysdeps/x86_64/dl-tlsdesc-dynamic.h +++ b/sysdeps/x86_64/dl-tlsdesc-dynamic.h @@ -99,7 +99,7 @@ _dl_tlsdesc_dynamic: # endif #else /* Allocate stack space of the required size to save the state. */ - sub _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP + sub _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_FULL_SIZE_OFFSET(%rip), %RSP_LP #endif /* Besides rdi and rsi, saved above, save rcx, rdx, r8, r9, r10 and r11. */ -- cgit 1.4.1