diff options
author | Andrew Senkevich <andrew.senkevich@intel.com> | 2016-07-01 14:15:38 +0300 |
---|---|---|
committer | Andrew Senkevich <andrew.senkevich@intel.com> | 2016-07-01 14:15:38 +0300 |
commit | ee2196bb6766ca7e63a1ba22ebb7619a3266776a (patch) | |
tree | a99accc0d97a405f535249efd7657de270726850 /sysdeps/x86_64/fpu/svml_s_sincosf16_core.S | |
parent | fd1cf1dc3b2d90c2a61332363feb1043f6916564 (diff) | |
download | glibc-ee2196bb6766ca7e63a1ba22ebb7619a3266776a.tar.gz glibc-ee2196bb6766ca7e63a1ba22ebb7619a3266776a.tar.xz glibc-ee2196bb6766ca7e63a1ba22ebb7619a3266776a.zip |
Fixed wrong vector sincos/sincosf ABI to have it compatible with
current vector function declaration "#pragma omp declare simd notinbranch", according to which vector sincos should have vector of pointers for second and third parameters. It is fixed with implementation as wrapper to version having second and third parameters as pointers. [BZ #20024] * sysdeps/x86/fpu/test-math-vector-sincos.h: New. * sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core_sse4.S: Fixed ABI of this implementation of vector function. * sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core_avx2.S: Likewise. * sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S: Likewise. * sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S: Likewise. * sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core_sse4.S: Likewise. * sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core_avx2.S: Likewise. * sysdeps/x86_64/fpu/svml_d_sincos2_core.S: Likewise. * sysdeps/x86_64/fpu/svml_d_sincos4_core.S: Likewise. * sysdeps/x86_64/fpu/svml_d_sincos4_core_avx.S: Likewise. * sysdeps/x86_64/fpu/svml_d_sincos8_core.S: Likewise. * sysdeps/x86_64/fpu/svml_s_sincosf16_core.S: Likewise. * sysdeps/x86_64/fpu/svml_s_sincosf4_core.S: Likewise. * sysdeps/x86_64/fpu/svml_s_sincosf8_core.S: Likewise. * sysdeps/x86_64/fpu/svml_s_sincosf8_core_avx.S: Likewise. * sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c: Use another wrapper for testing vector sincos with fixed ABI. * sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c: Likewise. * sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c: Likewise. * sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c: Likewise. * sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c: Likewise. * sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c: Likewise. * sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c: Likewise. * sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c: Likewise. * sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx.c: New test. * sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx2.c: Likewise. * sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx512.c: Likewise. * sysdeps/x86_64/fpu/test-double-libmvec-sincos.c: Likewise. * sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx.c: Likewise. * sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx2.c: Likewise. * sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx512.c: Likewise. * sysdeps/x86_64/fpu/test-float-libmvec-sincosf.c: Likewise. * sysdeps/x86_64/fpu/Makefile: Added new tests.
Diffstat (limited to 'sysdeps/x86_64/fpu/svml_s_sincosf16_core.S')
-rw-r--r-- | sysdeps/x86_64/fpu/svml_s_sincosf16_core.S | 335 |
1 files changed, 334 insertions, 1 deletions
diff --git a/sysdeps/x86_64/fpu/svml_s_sincosf16_core.S b/sysdeps/x86_64/fpu/svml_s_sincosf16_core.S index 5cbf10b8da..aae1adb8d7 100644 --- a/sysdeps/x86_64/fpu/svml_s_sincosf16_core.S +++ b/sysdeps/x86_64/fpu/svml_s_sincosf16_core.S @@ -20,6 +20,339 @@ #include "svml_s_wrapper_impl.h" .text +ENTRY (_ZGVeN16vl4l4_sincosf) +WRAPPER_IMPL_AVX512_fFF _ZGVdN8vl4l4_sincosf +END (_ZGVeN16vl4l4_sincosf) + +/* AVX512 ISA version as wrapper to AVX2 ISA version (for vector + function declared with #pragma omp declare simd notinbranch). */ +.macro WRAPPER_IMPL_AVX512_fFF_vvv callee +#ifndef __ILP32__ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $448, %rsp + /* Encoding for vmovups %zmm0, 384(%rsp). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x44 + .byte 0x24 + .byte 0x06 + lea (%rsp), %rdi + /* Encoding for vmovups %zmm1, 128(%rdi). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x4f + .byte 0x02 + /* Encoding for vmovups %zmm2, 192(%rdi). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x57 + .byte 0x03 + /* Encoding for vmovups %zmm3, 256(%rdi). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x5f + .byte 0x04 + /* Encoding for vmovups %zmm4, 320(%rdi). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x67 + .byte 0x05 + lea 64(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) + vmovdqu 416(%rsp), %ymm0 + lea 32(%rsp), %rdi + lea 96(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) + movq 128(%rsp), %rdx + movq 136(%rsp), %rsi + movq 144(%rsp), %r8 + movq 152(%rsp), %r10 + movl (%rsp), %eax + movl 4(%rsp), %ecx + movl 8(%rsp), %edi + movl 12(%rsp), %r9d + movl %eax, (%rdx) + movl %ecx, (%rsi) + movq 160(%rsp), %rax + movq 168(%rsp), %rcx + movl %edi, (%r8) + movl %r9d, (%r10) + movq 176(%rsp), %rdi + movq 184(%rsp), %r9 + movl 16(%rsp), %r11d + movl 20(%rsp), %edx + movl 24(%rsp), %esi + movl 28(%rsp), %r8d + movl %r11d, (%rax) + movl %edx, (%rcx) + movq 192(%rsp), %r11 + movq 200(%rsp), %rdx + movl %esi, (%rdi) + movl %r8d, (%r9) + movq 208(%rsp), %rsi + movq 216(%rsp), %r8 + movl 32(%rsp), %r10d + movl 36(%rsp), %eax + movl 40(%rsp), %ecx + movl 44(%rsp), %edi + movl %r10d, (%r11) + movl %eax, (%rdx) + movq 224(%rsp), %r10 + movq 232(%rsp), %rax + movl %ecx, (%rsi) + movl %edi, (%r8) + movq 240(%rsp), %rcx + movq 248(%rsp), %rdi + movl 48(%rsp), %r9d + movl 52(%rsp), %r11d + movl 56(%rsp), %edx + movl 60(%rsp), %esi + movl %r9d, (%r10) + movl %r11d, (%rax) + movq 256(%rsp), %r9 + movq 264(%rsp), %r11 + movl %edx, (%rcx) + movl %esi, (%rdi) + movq 272(%rsp), %rdx + movq 280(%rsp), %rsi + movl 64(%rsp), %r8d + movl 68(%rsp), %r10d + movl 72(%rsp), %eax + movl 76(%rsp), %ecx + movl %r8d, (%r9) + movl %r10d, (%r11) + movq 288(%rsp), %r8 + movq 296(%rsp), %r10 + movl %eax, (%rdx) + movl %ecx, (%rsi) + movq 304(%rsp), %rax + movq 312(%rsp), %rcx + movl 80(%rsp), %edi + movl 84(%rsp), %r9d + movl 88(%rsp), %r11d + movl 92(%rsp), %edx + movl %edi, (%r8) + movl %r9d, (%r10) + movq 320(%rsp), %rdi + movq 328(%rsp), %r9 + movl %r11d, (%rax) + movl %edx, (%rcx) + movq 336(%rsp), %r11 + movq 344(%rsp), %rdx + movl 96(%rsp), %esi + movl 100(%rsp), %r8d + movl 104(%rsp), %r10d + movl 108(%rsp), %eax + movl %esi, (%rdi) + movl %r8d, (%r9) + movq 352(%rsp), %rsi + movq 360(%rsp), %r8 + movl %r10d, (%r11) + movl %eax, (%rdx) + movq 368(%rsp), %r10 + movq 376(%rsp), %rax + movl 112(%rsp), %ecx + movl 116(%rsp), %edi + movl 120(%rsp), %r9d + movl 124(%rsp), %r11d + movl %ecx, (%rsi) + movl %edi, (%r8) + movl %r9d, (%r10) + movl %r11d, (%rax) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +#else + leal 8(%rsp), %r10d + .cfi_def_cfa 10, 0 + andl $-64, %esp + pushq -8(%r10d) + pushq %rbp + .cfi_escape 0x10,0x6,0x2,0x76,0 + movl %esp, %ebp + pushq %r12 + leal -112(%rbp), %esi + pushq %r10 + .cfi_escape 0xf,0x3,0x76,0x70,0x6 + .cfi_escape 0x10,0xc,0x2,0x76,0x78 + leal -176(%rbp), %edi + movq %rsi, %r12 + pushq %rbx + .cfi_escape 0x10,0x3,0x2,0x76,0x68 + movq %rdi, %rbx + subl $344, %esp + /* Encoding for vmovdqa64 %zmm1, -240(%ebp). */ + .byte 0x67 + .byte 0x62 + .byte 0xf1 + .byte 0xfd + .byte 0x48 + .byte 0x7f + .byte 0x8d + .byte 0x10 + .byte 0xff + .byte 0xff + .byte 0xff + /* Encoding for vmovdqa64 %zmm2, -304(%ebp). */ + .byte 0x67 + .byte 0x62 + .byte 0xf1 + .byte 0xfd + .byte 0x48 + .byte 0x7f + .byte 0x95 + .byte 0xd0 + .byte 0xfe + .byte 0xff + .byte 0xff + /* Encoding for vmovaps %zmm0, -368(%ebp). */ + .byte 0x67 + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x29 + .byte 0x85 + .byte 0x90 + .byte 0xfe + .byte 0xff + .byte 0xff + call HIDDEN_JUMPTARGET(\callee) + leal 32(%r12), %esi + vmovups -336(%ebp), %ymm0 + leal 32(%rbx), %edi + call HIDDEN_JUMPTARGET(\callee) + movl -240(%ebp), %eax + vmovss -176(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -236(%ebp), %eax + vmovss -172(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -232(%ebp), %eax + vmovss -168(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -228(%ebp), %eax + vmovss -164(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -224(%ebp), %eax + vmovss -160(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -220(%ebp), %eax + vmovss -156(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -216(%ebp), %eax + vmovss -152(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -212(%ebp), %eax + vmovss -148(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -208(%ebp), %eax + vmovss -144(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -204(%ebp), %eax + vmovss -140(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -200(%ebp), %eax + vmovss -136(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -196(%ebp), %eax + vmovss -132(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -192(%ebp), %eax + vmovss -128(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -188(%ebp), %eax + vmovss -124(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -184(%ebp), %eax + vmovss -120(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -180(%ebp), %eax + vmovss -116(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -304(%ebp), %eax + vmovss -112(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -300(%ebp), %eax + vmovss -108(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -296(%ebp), %eax + vmovss -104(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -292(%ebp), %eax + vmovss -100(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -288(%ebp), %eax + vmovss -96(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -284(%ebp), %eax + vmovss -92(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -280(%ebp), %eax + vmovss -88(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -276(%ebp), %eax + vmovss -84(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -272(%ebp), %eax + vmovss -80(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -268(%ebp), %eax + vmovss -76(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -264(%ebp), %eax + vmovss -72(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -260(%ebp), %eax + vmovss -68(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -256(%ebp), %eax + vmovss -64(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -252(%ebp), %eax + vmovss -60(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -248(%ebp), %eax + vmovss -56(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -244(%ebp), %eax + vmovss -52(%ebp), %xmm0 + vmovss %xmm0, (%eax) + addl $344, %esp + popq %rbx + popq %r10 + .cfi_def_cfa 10, 0 + popq %r12 + popq %rbp + leal -8(%r10), %esp + .cfi_def_cfa 7, 8 + ret +#endif +.endm + ENTRY (_ZGVeN16vvv_sincosf) -WRAPPER_IMPL_AVX512_fFF _ZGVdN8vvv_sincosf +WRAPPER_IMPL_AVX512_fFF_vvv _ZGVdN8vl4l4_sincosf END (_ZGVeN16vvv_sincosf) |