summary refs log tree commit diff
path: root/sysdeps/x86_64/fpu/multiarch
diff options
context:
space:
mode:
Diffstat (limited to 'sysdeps/x86_64/fpu/multiarch')
-rw-r--r--sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core_sse4.S56
-rw-r--r--sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core_avx2.S98
-rw-r--r--sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S180
-rw-r--r--sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S310
-rw-r--r--sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core_sse4.S80
-rw-r--r--sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core_avx2.S152
6 files changed, 862 insertions, 14 deletions
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core_sse4.S
index d37275d7ab..6dfc61ee93 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core_sse4.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core_sse4.S
@@ -20,7 +20,7 @@
 #include "svml_d_trig_data.h"
 
 	.text
-ENTRY (_ZGVbN2vvv_sincos_sse4)
+ENTRY (_ZGVbN2vl8l8_sincos_sse4)
 /*
    ALGORITHM DESCRIPTION:
 
@@ -311,4 +311,58 @@ ENTRY (_ZGVbN2vvv_sincos_sse4)
 
         movsd     %xmm0, 256(%rsp,%r15)
         jmp       .LBL_1_7
+END (_ZGVbN2vl8l8_sincos_sse4)
+libmvec_hidden_def(_ZGVbN2vl8l8_sincos_sse4)
+
+/* vvv version implemented with wrapper to vl8l8 variant.  */
+ENTRY (_ZGVbN2vvv_sincos_sse4)
+#ifndef __ILP32__
+        subq      $72, %rsp
+        .cfi_def_cfa_offset 80
+        movdqu    %xmm1, 32(%rsp)
+        lea       (%rsp), %rdi
+        movdqu    %xmm2, 48(%rdi)
+        lea       16(%rsp), %rsi
+        call      HIDDEN_JUMPTARGET(_ZGVbN2vl8l8_sincos_sse4)
+        movq      32(%rsp), %rdx
+        movq      48(%rsp), %rsi
+        movq      40(%rsp), %r8
+        movq      56(%rsp), %r10
+        movq      (%rsp), %rax
+        movq      16(%rsp), %rcx
+        movq      8(%rsp), %rdi
+        movq      24(%rsp), %r9
+        movq      %rax, (%rdx)
+        movq      %rcx, (%rsi)
+        movq      %rdi, (%r8)
+        movq      %r9, (%r10)
+        addq      $72, %rsp
+        .cfi_def_cfa_offset 8
+        ret
+#else
+        subl    $72, %esp
+        .cfi_def_cfa_offset 80
+        leal    48(%rsp), %esi
+        movaps  %xmm1, 16(%esp)
+        leal    32(%rsp), %edi
+        movaps  %xmm2, (%esp)
+        call    HIDDEN_JUMPTARGET(_ZGVbN2vl8l8_sincos_sse4)
+        movdqa  16(%esp), %xmm1
+        movsd   32(%esp), %xmm0
+        movq    %xmm1, %rax
+        movdqa  (%esp), %xmm2
+        movsd   %xmm0, (%eax)
+        movsd   40(%esp), %xmm0
+        pextrd  $1, %xmm1, %eax
+        movsd   %xmm0, (%eax)
+        movsd   48(%esp), %xmm0
+        movq    %xmm2, %rax
+        movsd   %xmm0, (%eax)
+        movsd   56(%esp), %xmm0
+        pextrd  $1, %xmm2, %eax
+        movsd   %xmm0, (%eax)
+        addl    $72, %esp
+        .cfi_def_cfa_offset 8
+        ret
+#endif
 END (_ZGVbN2vvv_sincos_sse4)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core_avx2.S
index 24b57f4e8c..12f60100fa 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core_avx2.S
@@ -20,7 +20,7 @@
 #include "svml_d_trig_data.h"
 
 	.text
-ENTRY (_ZGVdN4vvv_sincos_avx2)
+ENTRY (_ZGVdN4vl8l8_sincos_avx2)
 /*
    ALGORITHM DESCRIPTION:
 
@@ -274,4 +274,100 @@ ENTRY (_ZGVdN4vvv_sincos_avx2)
         vmovsd    %xmm0, 384(%rsp,%r15)
         jmp       .LBL_1_7
 
+END (_ZGVdN4vl8l8_sincos_avx2)
+libmvec_hidden_def(_ZGVdN4vl8l8_sincos_avx2)
+
+/* vvv version implemented with wrapper to vl8l8 variant.  */
+ENTRY (_ZGVdN4vvv_sincos_avx2)
+#ifndef __ILP32__
+        pushq     %rbp
+        cfi_adjust_cfa_offset (8)
+        cfi_rel_offset (%rbp, 0)
+        movq      %rsp, %rbp
+        cfi_def_cfa_register (%rbp)
+        andq      $-32, %rsp
+        subq      $128, %rsp
+        vmovdqu   %ymm1, 64(%rsp)
+        lea       (%rsp), %rdi
+        vmovdqu   %ymm2, 96(%rdi)
+        lea       32(%rsp), %rsi
+        call      HIDDEN_JUMPTARGET(_ZGVdN4vl8l8_sincos_avx2)
+        movq      64(%rsp), %rdx
+        movq      96(%rsp), %rsi
+        movq      72(%rsp), %r8
+        movq      104(%rsp), %r10
+        movq      (%rsp), %rax
+        movq      32(%rsp), %rcx
+        movq      8(%rsp), %rdi
+        movq      40(%rsp), %r9
+        movq      %rax, (%rdx)
+        movq      %rcx, (%rsi)
+        movq      80(%rsp), %rax
+        movq      112(%rsp), %rcx
+        movq      %rdi, (%r8)
+        movq      %r9, (%r10)
+        movq      88(%rsp), %rdi
+        movq      120(%rsp), %r9
+        movq      16(%rsp), %r11
+        movq      48(%rsp), %rdx
+        movq      24(%rsp), %rsi
+        movq      56(%rsp), %r8
+        movq      %r11, (%rax)
+        movq      %rdx, (%rcx)
+        movq      %rsi, (%rdi)
+        movq      %r8, (%r9)
+        movq      %rbp, %rsp
+        cfi_def_cfa_register (%rsp)
+        popq      %rbp
+        cfi_adjust_cfa_offset (-8)
+        cfi_restore (%rbp)
+        ret
+#else
+        leal    8(%rsp), %r10d
+        .cfi_def_cfa 10, 0
+        andl    $-32, %esp
+        pushq   -8(%r10d)
+        pushq   %rbp
+        .cfi_escape 0x10,0x6,0x2,0x76,0
+        movl    %esp, %ebp
+        pushq   %r10
+        .cfi_escape 0xf,0x3,0x76,0x78,0x6
+        leal    -48(%rbp), %esi
+        leal    -80(%rbp), %edi
+        subl    $104, %esp
+        vmovaps %xmm1, -96(%ebp)
+        vmovaps %xmm2, -112(%ebp)
+        call    HIDDEN_JUMPTARGET(_ZGVdN4vl8l8_sincos_avx2)
+        movl    -96(%ebp), %eax
+        vmovsd  -80(%ebp), %xmm0
+        vmovsd  %xmm0, (%eax)
+        movl    -92(%ebp), %eax
+        vmovsd  -72(%ebp), %xmm0
+        vmovsd  %xmm0, (%eax)
+        movl    -88(%ebp), %eax
+        vmovsd  -64(%ebp), %xmm0
+        vmovsd  %xmm0, (%eax)
+        movl    -84(%ebp), %eax
+        vmovsd  -56(%ebp), %xmm0
+        vmovsd  %xmm0, (%eax)
+        movl    -112(%ebp), %eax
+        vmovsd  -48(%ebp), %xmm0
+        vmovsd  %xmm0, (%eax)
+        movl    -108(%ebp), %eax
+        vmovsd  -40(%ebp), %xmm0
+        vmovsd  %xmm0, (%eax)
+        movl    -104(%ebp), %eax
+        vmovsd  -32(%ebp), %xmm0
+        vmovsd  %xmm0, (%eax)
+        movl    -100(%ebp), %eax
+        vmovsd  -24(%ebp), %xmm0
+        vmovsd  %xmm0, (%eax)
+        addl    $104, %esp
+        popq    %r10
+        .cfi_def_cfa 10, 0
+        popq    %rbp
+        leal    -8(%r10), %esp
+        .cfi_def_cfa 7, 8
+        ret
+#endif
 END (_ZGVdN4vvv_sincos_avx2)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
index 1d9f426d37..12ffb0ce9f 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
@@ -36,9 +36,9 @@
      sin(R), sin(R') are approximated by corresponding polynomial.  */
 
 	.text
-ENTRY (_ZGVeN8vvv_sincos_knl)
+ENTRY (_ZGVeN8vl8l8_sincos_knl)
 #ifndef HAVE_AVX512_ASM_SUPPORT
-WRAPPER_IMPL_AVX512_fFF _ZGVdN4vvv_sincos
+WRAPPER_IMPL_AVX512_fFF _ZGVdN4vl8l8_sincos
 #else
         pushq     %rbp
         cfi_adjust_cfa_offset (8)
@@ -304,11 +304,12 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN4vvv_sincos
         jmp       .LBL_1_7
 
 #endif
-END (_ZGVeN8vvv_sincos_knl)
+END (_ZGVeN8vl8l8_sincos_knl)
+libmvec_hidden_def(_ZGVeN8vl8l8_sincos_knl)
 
-ENTRY (_ZGVeN8vvv_sincos_skx)
+ENTRY (_ZGVeN8vl8l8_sincos_skx)
 #ifndef HAVE_AVX512_ASM_SUPPORT
-WRAPPER_IMPL_AVX512_fFF _ZGVdN4vvv_sincos
+WRAPPER_IMPL_AVX512_fFF _ZGVdN4vl8l8_sincos
 #else
         pushq     %rbp
         cfi_adjust_cfa_offset (8)
@@ -585,6 +586,175 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN4vvv_sincos
         jmp       .LBL_2_7
 
 #endif
+END (_ZGVeN8vl8l8_sincos_skx)
+libmvec_hidden_def(_ZGVeN8vl8l8_sincos_skx)
+
+/* Wrapper between vvv and vl8l8 vector variants.  */
+.macro WRAPPER_AVX512_vvv_vl8l8 callee
+#ifndef __ILP32__
+        pushq     %rbp
+        cfi_adjust_cfa_offset (8)
+        cfi_rel_offset (%rbp, 0)
+        movq      %rsp, %rbp
+        cfi_def_cfa_register (%rbp)
+        andq      $-64, %rsp
+        subq      $256, %rsp
+        /* Encoding for vmovups %zmm1, 128(%rsp).  */
+        .byte 0x62
+        .byte 0xf1
+        .byte 0x7c
+        .byte 0x48
+        .byte 0x11
+        .byte 0x4c
+        .byte 0x24
+        .byte 0x02
+        lea       (%rsp), %rdi
+        /* Encoding for vmovups %zmm2, 192(%rdi).  */
+        .byte 0x62
+        .byte 0xf1
+        .byte 0x7c
+        .byte 0x48
+        .byte 0x11
+        .byte 0x57
+        .byte 0x03
+        lea       64(%rsp), %rsi
+        call      HIDDEN_JUMPTARGET(\callee)
+        movq      128(%rsp), %rdx
+        movq      136(%rsp), %rsi
+        movq      144(%rsp), %r8
+        movq      152(%rsp), %r10
+        movq      (%rsp), %rax
+        movq      8(%rsp), %rcx
+        movq      16(%rsp), %rdi
+        movq      24(%rsp), %r9
+        movq      %rax, (%rdx)
+        movq      %rcx, (%rsi)
+        movq      160(%rsp), %rax
+        movq      168(%rsp), %rcx
+        movq      %rdi, (%r8)
+        movq      %r9, (%r10)
+        movq      176(%rsp), %rdi
+        movq      184(%rsp), %r9
+        movq      32(%rsp), %r11
+        movq      40(%rsp), %rdx
+        movq      48(%rsp), %rsi
+        movq      56(%rsp), %r8
+        movq      %r11, (%rax)
+        movq      %rdx, (%rcx)
+        movq      192(%rsp), %r11
+        movq      200(%rsp), %rdx
+        movq      %rsi, (%rdi)
+        movq      %r8, (%r9)
+        movq      208(%rsp), %rsi
+        movq      216(%rsp), %r8
+        movq      64(%rsp), %r10
+        movq      72(%rsp), %rax
+        movq      80(%rsp), %rcx
+        movq      88(%rsp), %rdi
+        movq      %r10, (%r11)
+        movq      %rax, (%rdx)
+        movq      224(%rsp), %r10
+        movq      232(%rsp), %rax
+        movq      %rcx, (%rsi)
+        movq      %rdi, (%r8)
+        movq      240(%rsp), %rcx
+        movq      248(%rsp), %rdi
+        movq      96(%rsp), %r9
+        movq      104(%rsp), %r11
+        movq      112(%rsp), %rdx
+        movq      120(%rsp), %rsi
+        movq      %r9, (%r10)
+        movq      %r11, (%rax)
+        movq      %rdx, (%rcx)
+        movq      %rsi, (%rdi)
+        movq      %rbp, %rsp
+        cfi_def_cfa_register (%rsp)
+        popq      %rbp
+        cfi_adjust_cfa_offset (-8)
+        cfi_restore (%rbp)
+        ret
+#else
+        leal    8(%rsp), %r10d
+        .cfi_def_cfa 10, 0
+        andl    $-64, %esp
+        pushq   -8(%r10d)
+        pushq   %rbp
+        .cfi_escape 0x10,0x6,0x2,0x76,0
+        movl    %esp, %ebp
+        pushq   %r10
+        .cfi_escape 0xf,0x3,0x76,0x78,0x6
+        leal    -112(%rbp), %esi
+        leal    -176(%rbp), %edi
+        subl    $232, %esp
+        vmovdqa %ymm1, -208(%ebp)
+        vmovdqa %ymm2, -240(%ebp)
+        call    HIDDEN_JUMPTARGET(\callee)
+        vmovdqa -208(%ebp), %xmm0
+        vmovq   %xmm0, %rax
+        vmovsd  -176(%ebp), %xmm0
+        vmovsd  %xmm0, (%eax)
+        shrq    $32, %rax
+        vmovsd  -168(%ebp), %xmm0
+        vmovsd  %xmm0, (%eax)
+        movq    -200(%ebp), %rax
+        vmovsd  -160(%ebp), %xmm0
+        vmovsd  %xmm0, (%eax)
+        shrq    $32, %rax
+        vmovsd  -152(%ebp), %xmm0
+        vmovsd  %xmm0, (%eax)
+        movq    -192(%ebp), %rax
+        vmovsd  -144(%ebp), %xmm0
+        vmovsd  %xmm0, (%eax)
+        shrq    $32, %rax
+        vmovsd  -136(%ebp), %xmm0
+        vmovsd  %xmm0, (%eax)
+        movq    -184(%ebp), %rax
+        vmovsd  -128(%ebp), %xmm0
+        vmovsd  %xmm0, (%eax)
+        shrq    $32, %rax
+        vmovsd  -120(%ebp), %xmm0
+        vmovsd  %xmm0, (%eax)
+        vmovdqa -240(%ebp), %xmm0
+        vmovq   %xmm0, %rax
+        vmovsd  -112(%ebp), %xmm0
+        vmovsd  %xmm0, (%eax)
+        shrq    $32, %rax
+        vmovsd  -104(%ebp), %xmm0
+        vmovsd  %xmm0, (%eax)
+        movq    -232(%ebp), %rax
+        vmovsd  -96(%ebp), %xmm0
+        vmovsd  %xmm0, (%eax)
+        shrq    $32, %rax
+        vmovsd  -88(%ebp), %xmm0
+        vmovsd  %xmm0, (%eax)
+        movq    -224(%ebp), %rax
+        vmovsd  -80(%ebp), %xmm0
+        vmovsd  %xmm0, (%eax)
+        shrq    $32, %rax
+        vmovsd  -72(%ebp), %xmm0
+        vmovsd  %xmm0, (%eax)
+        movq    -216(%ebp), %rax
+        vmovsd  -64(%ebp), %xmm0
+        vmovsd  %xmm0, (%eax)
+        shrq    $32, %rax
+        vmovsd  -56(%ebp), %xmm0
+        vmovsd  %xmm0, (%eax)
+        addl    $232, %esp
+        popq    %r10
+        .cfi_def_cfa 10, 0
+        popq    %rbp
+        leal    -8(%r10), %esp
+        .cfi_def_cfa 7, 8
+        ret
+#endif
+.endm
+
+ENTRY (_ZGVeN8vvv_sincos_knl)
+WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_knl
+END (_ZGVeN8vvv_sincos_knl)
+
+ENTRY (_ZGVeN8vvv_sincos_skx)
+WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx
 END (_ZGVeN8vvv_sincos_skx)
 
 	.section .rodata, "a"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
index e375de8970..7621e87581 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
@@ -49,9 +49,9 @@
            R2 = XOR( RC, SC ).  */
 
 	.text
-ENTRY (_ZGVeN16vvv_sincosf_knl)
+ENTRY (_ZGVeN16vl4l4_sincosf_knl)
 #ifndef HAVE_AVX512_ASM_SUPPORT
-WRAPPER_IMPL_AVX512_fFF _ZGVdN8vvv_sincosf
+WRAPPER_IMPL_AVX512_fFF _ZGVdN8vl4l4_sincosf
 #else
         pushq     %rbp
         cfi_adjust_cfa_offset (8)
@@ -267,9 +267,10 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN8vvv_sincosf
         vmovss    %xmm0, 1280(%rsp,%r15,8)
         jmp       .LBL_1_7
 #endif
-END (_ZGVeN16vvv_sincosf_knl)
+END (_ZGVeN16vl4l4_sincosf_knl)
+libmvec_hidden_def(_ZGVeN16vl4l4_sincosf_knl)
 
-ENTRY (_ZGVeN16vvv_sincosf_skx)
+ENTRY (_ZGVeN16vl4l4_sincosf_skx)
 #ifndef HAVE_AVX512_ASM_SUPPORT
 WRAPPER_IMPL_AVX512_fFF _ZGVdN8vvv_sincosf
 #else
@@ -496,6 +497,307 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN8vvv_sincosf
         vmovss    %xmm0, 1280(%rsp,%r15,8)
         jmp       .LBL_2_7
 #endif
+END (_ZGVeN16vl4l4_sincosf_skx)
+libmvec_hidden_def(_ZGVeN16vl4l4_sincosf_skx)
+
+/* Wrapper between vvv and vl4l4 vector variants.  */
+.macro WRAPPER_AVX512_vvv_vl4l4 callee
+#ifndef __ILP32__
+        pushq     %rbp
+        cfi_adjust_cfa_offset (8)
+        cfi_rel_offset (%rbp, 0)
+        movq      %rsp, %rbp
+        cfi_def_cfa_register (%rbp)
+        andq      $-64, %rsp
+        subq      $384, %rsp
+        /* Encoding for vmovups %zmm1, 128(%rsp).  */
+        .byte 0x62
+        .byte 0xf1
+        .byte 0x7c
+        .byte 0x48
+        .byte 0x11
+        .byte 0x4c
+        .byte 0x24
+        .byte 0x02
+        lea       (%rsp), %rdi
+        /* Encoding for vmovups %zmm2, 192(%rdi).  */
+        .byte 0x62
+        .byte 0xf1
+        .byte 0x7c
+        .byte 0x48
+        .byte 0x11
+        .byte 0x57
+        .byte 0x03
+        /* Encoding for vmovups %zmm3, 256(%rdi).  */
+        .byte 0x62
+        .byte 0xf1
+        .byte 0x7c
+        .byte 0x48
+        .byte 0x11
+        .byte 0x5f
+        .byte 0x04
+        /* Encoding for vmovups %zmm4, 320(%rdi).  */
+        .byte 0x62
+        .byte 0xf1
+        .byte 0x7c
+        .byte 0x48
+        .byte 0x11
+        .byte 0x67
+        .byte 0x05
+        lea       64(%rsp), %rsi
+        call      HIDDEN_JUMPTARGET(\callee)
+        movq      128(%rsp), %rdx
+        movq      136(%rsp), %rsi
+        movq      144(%rsp), %r8
+        movq      152(%rsp), %r10
+        movl      (%rsp), %eax
+        movl      4(%rsp), %ecx
+        movl      8(%rsp), %edi
+        movl      12(%rsp), %r9d
+        movl      %eax, (%rdx)
+        movl      %ecx, (%rsi)
+        movq      160(%rsp), %rax
+        movq      168(%rsp), %rcx
+        movl      %edi, (%r8)
+        movl      %r9d, (%r10)
+        movq      176(%rsp), %rdi
+        movq      184(%rsp), %r9
+        movl      16(%rsp), %r11d
+        movl      20(%rsp), %edx
+        movl      24(%rsp), %esi
+        movl      28(%rsp), %r8d
+        movl      %r11d, (%rax)
+        movl      %edx, (%rcx)
+        movq      192(%rsp), %r11
+        movq      200(%rsp), %rdx
+        movl      %esi, (%rdi)
+        movl      %r8d, (%r9)
+        movq      208(%rsp), %rsi
+        movq      216(%rsp), %r8
+        movl      32(%rsp), %r10d
+        movl      36(%rsp), %eax
+        movl      40(%rsp), %ecx
+        movl      44(%rsp), %edi
+        movl      %r10d, (%r11)
+        movl      %eax, (%rdx)
+        movq      224(%rsp), %r10
+        movq      232(%rsp), %rax
+        movl      %ecx, (%rsi)
+        movl      %edi, (%r8)
+        movq      240(%rsp), %rcx
+        movq      248(%rsp), %rdi
+        movl      48(%rsp), %r9d
+        movl      52(%rsp), %r11d
+        movl      56(%rsp), %edx
+        movl      60(%rsp), %esi
+        movl      %r9d, (%r10)
+        movl      %r11d, (%rax)
+        movq      256(%rsp), %r9
+        movq      264(%rsp), %r11
+        movl      %edx, (%rcx)
+        movl      %esi, (%rdi)
+        movq      272(%rsp), %rdx
+        movq      280(%rsp), %rsi
+        movl      64(%rsp), %r8d
+        movl      68(%rsp), %r10d
+        movl      72(%rsp), %eax
+        movl      76(%rsp), %ecx
+        movl      %r8d, (%r9)
+        movl      %r10d, (%r11)
+        movq      288(%rsp), %r8
+        movq      296(%rsp), %r10
+        movl      %eax, (%rdx)
+        movl      %ecx, (%rsi)
+        movq      304(%rsp), %rax
+        movq      312(%rsp), %rcx
+        movl      80(%rsp), %edi
+        movl      84(%rsp), %r9d
+        movl      88(%rsp), %r11d
+        movl      92(%rsp), %edx
+        movl      %edi, (%r8)
+        movl      %r9d, (%r10)
+        movq      320(%rsp), %rdi
+        movq      328(%rsp), %r9
+        movl      %r11d, (%rax)
+        movl      %edx, (%rcx)
+        movq      336(%rsp), %r11
+        movq      344(%rsp), %rdx
+        movl      96(%rsp), %esi
+        movl      100(%rsp), %r8d
+        movl      104(%rsp), %r10d
+        movl      108(%rsp), %eax
+        movl      %esi, (%rdi)
+        movl      %r8d, (%r9)
+        movq      352(%rsp), %rsi
+        movq      360(%rsp), %r8
+        movl      %r10d, (%r11)
+        movl      %eax, (%rdx)
+        movq      368(%rsp), %r10
+        movq      376(%rsp), %rax
+        movl      112(%rsp), %ecx
+        movl      116(%rsp), %edi
+        movl      120(%rsp), %r9d
+        movl      124(%rsp), %r11d
+        movl      %ecx, (%rsi)
+        movl      %edi, (%r8)
+        movl      %r9d, (%r10)
+        movl      %r11d, (%rax)
+        movq      %rbp, %rsp
+        cfi_def_cfa_register (%rsp)
+        popq      %rbp
+        cfi_adjust_cfa_offset (-8)
+        cfi_restore (%rbp)
+        ret
+#else
+        leal    8(%rsp), %r10d
+        .cfi_def_cfa 10, 0
+        andl    $-64, %esp
+        pushq   -8(%r10d)
+        pushq   %rbp
+        .cfi_escape 0x10,0x6,0x2,0x76,0
+        movl    %esp, %ebp
+        pushq   %r10
+        .cfi_escape 0xf,0x3,0x76,0x78,0x6
+        leal    -112(%rbp), %esi
+        leal    -176(%rbp), %edi
+        subl    $296, %esp
+        /* Encoding for vmovdqa64 %zmm1, -240(%ebp).  */
+        .byte 0x67
+        .byte 0x62
+        .byte 0xf1
+        .byte 0xfd
+        .byte 0x48
+        .byte 0x7f
+        .byte 0x8d
+        .byte 0x10
+        .byte 0xff
+        .byte 0xff
+        .byte 0xff
+        /* Encoding for vmovdqa64 %zmm2, -304(%ebp).  */
+        .byte 0x67
+        .byte 0x62
+        .byte 0xf1
+        .byte 0xfd
+        .byte 0x48
+        .byte 0x7f
+        .byte 0x95
+        .byte 0xd0
+        .byte 0xfe
+        .byte 0xff
+        .byte 0xff
+        call    HIDDEN_JUMPTARGET(\callee)
+        movl    -240(%ebp), %eax
+        vmovss  -176(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -236(%ebp), %eax
+        vmovss  -172(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -232(%ebp), %eax
+        vmovss  -168(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -228(%ebp), %eax
+        vmovss  -164(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -224(%ebp), %eax
+        vmovss  -160(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -220(%ebp), %eax
+        vmovss  -156(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -216(%ebp), %eax
+        vmovss  -152(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -212(%ebp), %eax
+        vmovss  -148(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -208(%ebp), %eax
+        vmovss  -144(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -204(%ebp), %eax
+        vmovss  -140(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -200(%ebp), %eax
+        vmovss  -136(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -196(%ebp), %eax
+        vmovss  -132(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -192(%ebp), %eax
+        vmovss  -128(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -188(%ebp), %eax
+        vmovss  -124(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -184(%ebp), %eax
+        vmovss  -120(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -180(%ebp), %eax
+        vmovss  -116(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -304(%ebp), %eax
+        vmovss  -112(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -300(%ebp), %eax
+        vmovss  -108(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -296(%ebp), %eax
+        vmovss  -104(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -292(%ebp), %eax
+        vmovss  -100(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -288(%ebp), %eax
+        vmovss  -96(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -284(%ebp), %eax
+        vmovss  -92(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -280(%ebp), %eax
+        vmovss  -88(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -276(%ebp), %eax
+        vmovss  -84(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -272(%ebp), %eax
+        vmovss  -80(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -268(%ebp), %eax
+        vmovss  -76(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -264(%ebp), %eax
+        vmovss  -72(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -260(%ebp), %eax
+        vmovss  -68(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -256(%ebp), %eax
+        vmovss  -64(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -252(%ebp), %eax
+        vmovss  -60(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -248(%ebp), %eax
+        vmovss  -56(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -244(%ebp), %eax
+        vmovss  -52(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        addl    $296, %esp
+        popq    %r10
+        .cfi_def_cfa 10, 0
+        popq    %rbp
+        leal    -8(%r10), %esp
+        .cfi_def_cfa 7, 8
+        ret
+#endif
+.endm
+
+ENTRY (_ZGVeN16vvv_sincosf_knl)
+WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_knl
+END (_ZGVeN16vvv_sincosf_knl)
+
+ENTRY (_ZGVeN16vvv_sincosf_skx)
+WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx
 END (_ZGVeN16vvv_sincosf_skx)
 
 	.section .rodata, "a"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core_sse4.S
index 562367b136..5e8ea8bf76 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core_sse4.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core_sse4.S
@@ -20,7 +20,7 @@
 #include "svml_s_trig_data.h"
 
 	.text
-ENTRY (_ZGVbN4vvv_sincosf_sse4)
+ENTRY (_ZGVbN4vl4l4_sincosf_sse4)
 /*
    ALGORITHM DESCRIPTION:
 
@@ -265,4 +265,82 @@ ENTRY (_ZGVbN4vvv_sincosf_sse4)
         movss     %xmm0, 256(%rsp,%r15,8)
         jmp       .LBL_1_7
 
+END (_ZGVbN4vl4l4_sincosf_sse4)
+libmvec_hidden_def(_ZGVbN4vl4l4_sincosf_sse4)
+
+/* vvv version implemented with wrapper to vl4l4 variant.  */
+ENTRY (_ZGVbN4vvv_sincosf_sse4)
+#ifndef __ILP32__
+        subq      $104, %rsp
+        .cfi_def_cfa_offset 112
+        movdqu    %xmm1, 32(%rsp)
+        lea       (%rsp), %rdi
+        movdqu    %xmm2, 48(%rdi)
+        lea       16(%rsp), %rsi
+        movdqu    %xmm3, 48(%rsi)
+        movdqu    %xmm4, 64(%rsi)
+        call      HIDDEN_JUMPTARGET(_ZGVbN4vl4l4_sincosf_sse4)
+        movq      32(%rsp), %rdx
+        movq      40(%rsp), %rsi
+        movq      48(%rsp), %r8
+        movq      56(%rsp), %r10
+        movl      (%rsp), %eax
+        movl      4(%rsp), %ecx
+        movl      8(%rsp), %edi
+        movl      12(%rsp), %r9d
+        movl      %eax, (%rdx)
+        movl      %ecx, (%rsi)
+        movq      64(%rsp), %rax
+        movq      72(%rsp), %rcx
+        movl      %edi, (%r8)
+        movl      %r9d, (%r10)
+        movq      80(%rsp), %rdi
+        movq      88(%rsp), %r9
+        movl      16(%rsp), %r11d
+        movl      20(%rsp), %edx
+        movl      24(%rsp), %esi
+        movl      28(%rsp), %r8d
+        movl      %r11d, (%rax)
+        movl      %edx, (%rcx)
+        movl      %esi, (%rdi)
+        movl      %r8d, (%r9)
+        addq      $104, %rsp
+        .cfi_def_cfa_offset 8
+        ret
+#else
+        subl    $72, %esp
+        .cfi_def_cfa_offset 80
+        leal    48(%rsp), %esi
+        movaps  %xmm1, 16(%esp)
+        leal    32(%rsp), %edi
+        movaps  %xmm2, (%esp)
+        call    HIDDEN_JUMPTARGET(_ZGVbN4vl4l4_sincosf_sse4)
+        movl    16(%esp), %eax
+        movss   32(%esp), %xmm0
+        movss   %xmm0, (%eax)
+        movl    20(%esp), %eax
+        movss   36(%esp), %xmm0
+        movss   %xmm0, (%eax)
+        movl    24(%esp), %eax
+        movss   40(%esp), %xmm0
+        movss   %xmm0, (%eax)
+        movl    28(%esp), %eax
+        movss   44(%esp), %xmm0
+        movss   %xmm0, (%eax)
+        movl    (%esp), %eax
+        movss   48(%esp), %xmm0
+        movss   %xmm0, (%eax)
+        movl    4(%esp), %eax
+        movss   52(%esp), %xmm0
+        movss   %xmm0, (%eax)
+        movl    8(%esp), %eax
+        movss   56(%esp), %xmm0
+        movss   %xmm0, (%eax)
+        movl    12(%esp), %eax
+        movss   60(%esp), %xmm0
+        movss   %xmm0, (%eax)
+        addl    $72, %esp
+        .cfi_def_cfa_offset 8
+        ret
+#endif
 END (_ZGVbN4vvv_sincosf_sse4)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core_avx2.S
index baf887dd0a..75c28d1daa 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core_avx2.S
@@ -20,7 +20,7 @@
 #include "svml_s_trig_data.h"
 
 	.text
-ENTRY(_ZGVdN8vvv_sincosf_avx2)
+ENTRY (_ZGVdN8vl4l4_sincosf_avx2)
 /*
    ALGORITHM DESCRIPTION:
 
@@ -238,4 +238,152 @@ ENTRY(_ZGVdN8vvv_sincosf_avx2)
         vmovss    %xmm0, 384(%rsp,%r15,8)
         jmp       .LBL_1_7
 
-END(_ZGVdN8vvv_sincosf_avx2)
+END (_ZGVdN8vl4l4_sincosf_avx2)
+libmvec_hidden_def(_ZGVdN8vl4l4_sincosf_avx2)
+
+/* vvv version implemented with wrapper to vl4l4 variant.  */
+ENTRY (_ZGVdN8vvv_sincosf_avx2)
+#ifndef __ILP32__
+        pushq     %rbp
+        cfi_adjust_cfa_offset (8)
+        cfi_rel_offset (%rbp, 0)
+        movq      %rsp, %rbp
+        cfi_def_cfa_register (%rbp)
+        andq      $-32, %rsp
+        subq      $192, %rsp
+        vmovdqu   %ymm1, 64(%rsp)
+        lea       (%rsp), %rdi
+        vmovdqu   %ymm2, 96(%rdi)
+        vmovdqu   %ymm3, 128(%rdi)
+        vmovdqu   %ymm4, 160(%rdi)
+        lea       32(%rsp), %rsi
+        call      HIDDEN_JUMPTARGET(_ZGVdN8vl4l4_sincosf_avx2)
+        movq      64(%rsp), %rdx
+        movq      72(%rsp), %rsi
+        movq      80(%rsp), %r8
+        movq      88(%rsp), %r10
+        movl      (%rsp), %eax
+        movl      4(%rsp), %ecx
+        movl      8(%rsp), %edi
+        movl      12(%rsp), %r9d
+        movl      %eax, (%rdx)
+        movl      %ecx, (%rsi)
+        movq      96(%rsp), %rax
+        movq      104(%rsp), %rcx
+        movl      %edi, (%r8)
+        movl      %r9d, (%r10)
+        movq      112(%rsp), %rdi
+        movq      120(%rsp), %r9
+        movl      16(%rsp), %r11d
+        movl      20(%rsp), %edx
+        movl      24(%rsp), %esi
+        movl      28(%rsp), %r8d
+        movl      %r11d, (%rax)
+        movl      %edx, (%rcx)
+        movq      128(%rsp), %r11
+        movq      136(%rsp), %rdx
+        movl      %esi, (%rdi)
+        movl      %r8d, (%r9)
+        movq      144(%rsp), %rsi
+        movq      152(%rsp), %r8
+        movl      32(%rsp), %r10d
+        movl      36(%rsp), %eax
+        movl      40(%rsp), %ecx
+        movl      44(%rsp), %edi
+        movl      %r10d, (%r11)
+        movl      %eax, (%rdx)
+        movq      160(%rsp), %r10
+        movq      168(%rsp), %rax
+        movl      %ecx, (%rsi)
+        movl      %edi, (%r8)
+        movq      176(%rsp), %rcx
+        movq      184(%rsp), %rdi
+        movl      48(%rsp), %r9d
+        movl      52(%rsp), %r11d
+        movl      56(%rsp), %edx
+        movl      60(%rsp), %esi
+        movl      %r9d, (%r10)
+        movl      %r11d, (%rax)
+        movl      %edx, (%rcx)
+        movl      %esi, (%rdi)
+        movq      %rbp, %rsp
+        cfi_def_cfa_register (%rsp)
+        popq      %rbp
+        cfi_adjust_cfa_offset (-8)
+        cfi_restore (%rbp)
+        ret
+#else
+        leal    8(%rsp), %r10d
+        .cfi_def_cfa 10, 0
+        andl    $-32, %esp
+        pushq   -8(%r10d)
+        pushq   %rbp
+        .cfi_escape 0x10,0x6,0x2,0x76,0
+        movl    %esp, %ebp
+        pushq   %r10
+        .cfi_escape 0xf,0x3,0x76,0x78,0x6
+        leal    -48(%rbp), %esi
+        leal    -80(%rbp), %edi
+        subl    $136, %esp
+        vmovdqa %ymm1, -112(%ebp)
+        vmovdqa %ymm2, -144(%ebp)
+        call    HIDDEN_JUMPTARGET(_ZGVdN8vl4l4_sincosf_avx2)
+        vmovdqa -112(%ebp), %xmm0
+        vmovq   %xmm0, %rax
+        vmovss  -80(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        vmovss  -76(%ebp), %xmm0
+        shrq    $32, %rax
+        vmovss  %xmm0, (%eax)
+        movq    -104(%ebp), %rax
+        vmovss  -72(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        vmovss  -68(%ebp), %xmm0
+        shrq    $32, %rax
+        vmovss  %xmm0, (%eax)
+        movq    -96(%ebp), %rax
+        vmovss  -64(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        vmovss  -60(%ebp), %xmm0
+        shrq    $32, %rax
+        vmovss  %xmm0, (%eax)
+        movq    -88(%ebp), %rax
+        vmovss  -56(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        vmovss  -52(%ebp), %xmm0
+        shrq    $32, %rax
+        vmovss  %xmm0, (%eax)
+        vmovdqa -144(%ebp), %xmm0
+        vmovq   %xmm0, %rax
+        vmovss  -48(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        vmovss  -44(%ebp), %xmm0
+        shrq    $32, %rax
+        vmovss  %xmm0, (%eax)
+        movq    -136(%ebp), %rax
+        vmovss  -40(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        vmovss  -36(%ebp), %xmm0
+        shrq    $32, %rax
+        vmovss  %xmm0, (%eax)
+        movq    -128(%ebp), %rax
+        vmovss  -32(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        vmovss  -28(%ebp), %xmm0
+        shrq    $32, %rax
+        vmovss  %xmm0, (%eax)
+        movq    -120(%ebp), %rax
+        vmovss  -24(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        vmovss  -20(%ebp), %xmm0
+        shrq    $32, %rax
+        vmovss  %xmm0, (%eax)
+        addl    $136, %esp
+        popq    %r10
+        .cfi_def_cfa 10, 0
+        popq    %rbp
+        leal    -8(%r10), %esp
+        .cfi_def_cfa 7, 8
+        ret
+#endif
+END (_ZGVdN8vvv_sincosf_avx2)