about summary refs log tree commit diff
path: root/sysdeps/x86_64/fpu/svml_s_sincosf16_core.S
diff options
context:
space:
mode:
authorAndrew Senkevich <andrew.senkevich@intel.com>2016-07-01 14:15:38 +0300
committerAndrew Senkevich <andrew.senkevich@intel.com>2016-07-01 14:15:38 +0300
commitee2196bb6766ca7e63a1ba22ebb7619a3266776a (patch)
treea99accc0d97a405f535249efd7657de270726850 /sysdeps/x86_64/fpu/svml_s_sincosf16_core.S
parentfd1cf1dc3b2d90c2a61332363feb1043f6916564 (diff)
downloadglibc-ee2196bb6766ca7e63a1ba22ebb7619a3266776a.tar.gz
glibc-ee2196bb6766ca7e63a1ba22ebb7619a3266776a.tar.xz
glibc-ee2196bb6766ca7e63a1ba22ebb7619a3266776a.zip
Fixed wrong vector sincos/sincosf ABI to have it compatible with
current vector function declaration "#pragma omp declare simd notinbranch",
according to which vector sincos should have vector of pointers for second and
third parameters. It is fixed with implementation as wrapper to version
having second and third parameters as pointers.

    [BZ #20024]
    * sysdeps/x86/fpu/test-math-vector-sincos.h: New.
    * sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core_sse4.S: Fixed ABI
    of this implementation of vector function.
    * sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core_avx2.S: Likewise.
    * sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S: Likewise.
    * sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S:
    Likewise.
    * sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core_sse4.S: Likewise.
    * sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core_avx2.S: Likewise.
    * sysdeps/x86_64/fpu/svml_d_sincos2_core.S: Likewise.
    * sysdeps/x86_64/fpu/svml_d_sincos4_core.S: Likewise.
    * sysdeps/x86_64/fpu/svml_d_sincos4_core_avx.S: Likewise.
    * sysdeps/x86_64/fpu/svml_d_sincos8_core.S: Likewise.
    * sysdeps/x86_64/fpu/svml_s_sincosf16_core.S: Likewise.
    * sysdeps/x86_64/fpu/svml_s_sincosf4_core.S: Likewise.
    * sysdeps/x86_64/fpu/svml_s_sincosf8_core.S: Likewise.
    * sysdeps/x86_64/fpu/svml_s_sincosf8_core_avx.S: Likewise.
    * sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c: Use another wrapper
    for testing vector sincos with fixed ABI.
    * sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c: Likewise.
    * sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c: Likewise.
    * sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c: Likewise.
    * sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c: Likewise.
    * sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c: Likewise.
    * sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c: Likewise.
    * sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c: Likewise.
    * sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx.c: New test.
    * sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx2.c: Likewise.
    * sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx512.c: Likewise.
    * sysdeps/x86_64/fpu/test-double-libmvec-sincos.c: Likewise.
    * sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx.c: Likewise.
    * sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx2.c: Likewise.
    * sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx512.c: Likewise.
    * sysdeps/x86_64/fpu/test-float-libmvec-sincosf.c: Likewise.
    * sysdeps/x86_64/fpu/Makefile: Added new tests.
Diffstat (limited to 'sysdeps/x86_64/fpu/svml_s_sincosf16_core.S')
-rw-r--r--sysdeps/x86_64/fpu/svml_s_sincosf16_core.S335
1 files changed, 334 insertions, 1 deletions
diff --git a/sysdeps/x86_64/fpu/svml_s_sincosf16_core.S b/sysdeps/x86_64/fpu/svml_s_sincosf16_core.S
index 5cbf10b8da..aae1adb8d7 100644
--- a/sysdeps/x86_64/fpu/svml_s_sincosf16_core.S
+++ b/sysdeps/x86_64/fpu/svml_s_sincosf16_core.S
@@ -20,6 +20,339 @@
 #include "svml_s_wrapper_impl.h"
 
 	.text
+ENTRY (_ZGVeN16vl4l4_sincosf)
+WRAPPER_IMPL_AVX512_fFF _ZGVdN8vl4l4_sincosf
+END (_ZGVeN16vl4l4_sincosf)
+
+/* AVX512 ISA version as wrapper to AVX2 ISA version (for vector
+   function declared with #pragma omp declare simd notinbranch).  */
+.macro WRAPPER_IMPL_AVX512_fFF_vvv callee
+#ifndef __ILP32__
+        pushq     %rbp
+        cfi_adjust_cfa_offset (8)
+        cfi_rel_offset (%rbp, 0)
+        movq      %rsp, %rbp
+        cfi_def_cfa_register (%rbp)
+        andq      $-64, %rsp
+        subq      $448, %rsp
+        /* Encoding for vmovups %zmm0, 384(%rsp).  */
+        .byte 0x62
+        .byte 0xf1
+        .byte 0x7c
+        .byte 0x48
+        .byte 0x11
+        .byte 0x44
+        .byte 0x24
+        .byte 0x06
+        lea       (%rsp), %rdi
+        /* Encoding for vmovups %zmm1, 128(%rdi).  */
+        .byte 0x62
+        .byte 0xf1
+        .byte 0x7c
+        .byte 0x48
+        .byte 0x11
+        .byte 0x4f
+        .byte 0x02
+        /* Encoding for vmovups %zmm2, 192(%rdi).  */
+        .byte 0x62
+        .byte 0xf1
+        .byte 0x7c
+        .byte 0x48
+        .byte 0x11
+        .byte 0x57
+        .byte 0x03
+        /* Encoding for vmovups %zmm3, 256(%rdi).  */
+        .byte 0x62
+        .byte 0xf1
+        .byte 0x7c
+        .byte 0x48
+        .byte 0x11
+        .byte 0x5f
+        .byte 0x04
+        /* Encoding for vmovups %zmm4, 320(%rdi).  */
+        .byte 0x62
+        .byte 0xf1
+        .byte 0x7c
+        .byte 0x48
+        .byte 0x11
+        .byte 0x67
+        .byte 0x05
+        lea       64(%rsp), %rsi
+        call      HIDDEN_JUMPTARGET(\callee)
+        vmovdqu   416(%rsp), %ymm0
+        lea       32(%rsp), %rdi
+        lea       96(%rsp), %rsi
+        call      HIDDEN_JUMPTARGET(\callee)
+        movq      128(%rsp), %rdx
+        movq      136(%rsp), %rsi
+        movq      144(%rsp), %r8
+        movq      152(%rsp), %r10
+        movl      (%rsp), %eax
+        movl      4(%rsp), %ecx
+        movl      8(%rsp), %edi
+        movl      12(%rsp), %r9d
+        movl      %eax, (%rdx)
+        movl      %ecx, (%rsi)
+        movq      160(%rsp), %rax
+        movq      168(%rsp), %rcx
+        movl      %edi, (%r8)
+        movl      %r9d, (%r10)
+        movq      176(%rsp), %rdi
+        movq      184(%rsp), %r9
+        movl      16(%rsp), %r11d
+        movl      20(%rsp), %edx
+        movl      24(%rsp), %esi
+        movl      28(%rsp), %r8d
+        movl      %r11d, (%rax)
+        movl      %edx, (%rcx)
+        movq      192(%rsp), %r11
+        movq      200(%rsp), %rdx
+        movl      %esi, (%rdi)
+        movl      %r8d, (%r9)
+        movq      208(%rsp), %rsi
+        movq      216(%rsp), %r8
+        movl      32(%rsp), %r10d
+        movl      36(%rsp), %eax
+        movl      40(%rsp), %ecx
+        movl      44(%rsp), %edi
+        movl      %r10d, (%r11)
+        movl      %eax, (%rdx)
+        movq      224(%rsp), %r10
+        movq      232(%rsp), %rax
+        movl      %ecx, (%rsi)
+        movl      %edi, (%r8)
+        movq      240(%rsp), %rcx
+        movq      248(%rsp), %rdi
+        movl      48(%rsp), %r9d
+        movl      52(%rsp), %r11d
+        movl      56(%rsp), %edx
+        movl      60(%rsp), %esi
+        movl      %r9d, (%r10)
+        movl      %r11d, (%rax)
+        movq      256(%rsp), %r9
+        movq      264(%rsp), %r11
+        movl      %edx, (%rcx)
+        movl      %esi, (%rdi)
+        movq      272(%rsp), %rdx
+        movq      280(%rsp), %rsi
+        movl      64(%rsp), %r8d
+        movl      68(%rsp), %r10d
+        movl      72(%rsp), %eax
+        movl      76(%rsp), %ecx
+        movl      %r8d, (%r9)
+        movl      %r10d, (%r11)
+        movq      288(%rsp), %r8
+        movq      296(%rsp), %r10
+        movl      %eax, (%rdx)
+        movl      %ecx, (%rsi)
+        movq      304(%rsp), %rax
+        movq      312(%rsp), %rcx
+        movl      80(%rsp), %edi
+        movl      84(%rsp), %r9d
+        movl      88(%rsp), %r11d
+        movl      92(%rsp), %edx
+        movl      %edi, (%r8)
+        movl      %r9d, (%r10)
+        movq      320(%rsp), %rdi
+        movq      328(%rsp), %r9
+        movl      %r11d, (%rax)
+        movl      %edx, (%rcx)
+        movq      336(%rsp), %r11
+        movq      344(%rsp), %rdx
+        movl      96(%rsp), %esi
+        movl      100(%rsp), %r8d
+        movl      104(%rsp), %r10d
+        movl      108(%rsp), %eax
+        movl      %esi, (%rdi)
+        movl      %r8d, (%r9)
+        movq      352(%rsp), %rsi
+        movq      360(%rsp), %r8
+        movl      %r10d, (%r11)
+        movl      %eax, (%rdx)
+        movq      368(%rsp), %r10
+        movq      376(%rsp), %rax
+        movl      112(%rsp), %ecx
+        movl      116(%rsp), %edi
+        movl      120(%rsp), %r9d
+        movl      124(%rsp), %r11d
+        movl      %ecx, (%rsi)
+        movl      %edi, (%r8)
+        movl      %r9d, (%r10)
+        movl      %r11d, (%rax)
+        movq      %rbp, %rsp
+        cfi_def_cfa_register (%rsp)
+        popq      %rbp
+        cfi_adjust_cfa_offset (-8)
+        cfi_restore (%rbp)
+        ret
+#else
+        leal    8(%rsp), %r10d
+        .cfi_def_cfa 10, 0
+        andl    $-64, %esp
+        pushq   -8(%r10d)
+        pushq   %rbp
+        .cfi_escape 0x10,0x6,0x2,0x76,0
+        movl    %esp, %ebp
+        pushq   %r12
+        leal    -112(%rbp), %esi
+        pushq   %r10
+        .cfi_escape 0xf,0x3,0x76,0x70,0x6
+        .cfi_escape 0x10,0xc,0x2,0x76,0x78
+        leal    -176(%rbp), %edi
+        movq    %rsi, %r12
+        pushq   %rbx
+        .cfi_escape 0x10,0x3,0x2,0x76,0x68
+        movq    %rdi, %rbx
+        subl    $344, %esp
+        /* Encoding for vmovdqa64 %zmm1, -240(%ebp).  */
+        .byte 0x67
+        .byte 0x62
+        .byte 0xf1
+        .byte 0xfd
+        .byte 0x48
+        .byte 0x7f
+        .byte 0x8d
+        .byte 0x10
+        .byte 0xff
+        .byte 0xff
+        .byte 0xff
+        /* Encoding for vmovdqa64 %zmm2, -304(%ebp).  */
+        .byte 0x67
+        .byte 0x62
+        .byte 0xf1
+        .byte 0xfd
+        .byte 0x48
+        .byte 0x7f
+        .byte 0x95
+        .byte 0xd0
+        .byte 0xfe
+        .byte 0xff
+        .byte 0xff
+        /* Encoding for vmovaps %zmm0, -368(%ebp).  */
+        .byte 0x67
+        .byte 0x62
+        .byte 0xf1
+        .byte 0x7c
+        .byte 0x48
+        .byte 0x29
+        .byte 0x85
+        .byte 0x90
+        .byte 0xfe
+        .byte 0xff
+        .byte 0xff
+        call    HIDDEN_JUMPTARGET(\callee)
+        leal    32(%r12), %esi
+        vmovups -336(%ebp), %ymm0
+        leal    32(%rbx), %edi
+        call    HIDDEN_JUMPTARGET(\callee)
+        movl    -240(%ebp), %eax
+        vmovss  -176(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -236(%ebp), %eax
+        vmovss  -172(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -232(%ebp), %eax
+        vmovss  -168(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -228(%ebp), %eax
+        vmovss  -164(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -224(%ebp), %eax
+        vmovss  -160(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -220(%ebp), %eax
+        vmovss  -156(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -216(%ebp), %eax
+        vmovss  -152(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -212(%ebp), %eax
+        vmovss  -148(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -208(%ebp), %eax
+        vmovss  -144(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -204(%ebp), %eax
+        vmovss  -140(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -200(%ebp), %eax
+        vmovss  -136(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -196(%ebp), %eax
+        vmovss  -132(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -192(%ebp), %eax
+        vmovss  -128(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -188(%ebp), %eax
+        vmovss  -124(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -184(%ebp), %eax
+        vmovss  -120(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -180(%ebp), %eax
+        vmovss  -116(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -304(%ebp), %eax
+        vmovss  -112(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -300(%ebp), %eax
+        vmovss  -108(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -296(%ebp), %eax
+        vmovss  -104(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -292(%ebp), %eax
+        vmovss  -100(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -288(%ebp), %eax
+        vmovss  -96(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -284(%ebp), %eax
+        vmovss  -92(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -280(%ebp), %eax
+        vmovss  -88(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -276(%ebp), %eax
+        vmovss  -84(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -272(%ebp), %eax
+        vmovss  -80(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -268(%ebp), %eax
+        vmovss  -76(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -264(%ebp), %eax
+        vmovss  -72(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -260(%ebp), %eax
+        vmovss  -68(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -256(%ebp), %eax
+        vmovss  -64(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -252(%ebp), %eax
+        vmovss  -60(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -248(%ebp), %eax
+        vmovss  -56(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        movl    -244(%ebp), %eax
+        vmovss  -52(%ebp), %xmm0
+        vmovss  %xmm0, (%eax)
+        addl    $344, %esp
+        popq    %rbx
+        popq    %r10
+        .cfi_def_cfa 10, 0
+        popq    %r12
+        popq    %rbp
+        leal    -8(%r10), %esp
+        .cfi_def_cfa 7, 8
+        ret
+#endif
+.endm
+
 ENTRY (_ZGVeN16vvv_sincosf)
-WRAPPER_IMPL_AVX512_fFF _ZGVdN8vvv_sincosf
+WRAPPER_IMPL_AVX512_fFF_vvv _ZGVdN8vl4l4_sincosf
 END (_ZGVeN16vvv_sincosf)