diff options
author | Andrew Senkevich <andrew.senkevich@intel.com> | 2016-07-01 14:15:38 +0300 |
---|---|---|
committer | Andrew Senkevich <andrew.senkevich@intel.com> | 2016-07-01 14:15:38 +0300 |
commit | ee2196bb6766ca7e63a1ba22ebb7619a3266776a (patch) | |
tree | a99accc0d97a405f535249efd7657de270726850 /sysdeps/x86_64/fpu/svml_s_sincosf8_core.S | |
parent | fd1cf1dc3b2d90c2a61332363feb1043f6916564 (diff) | |
download | glibc-ee2196bb6766ca7e63a1ba22ebb7619a3266776a.tar.gz glibc-ee2196bb6766ca7e63a1ba22ebb7619a3266776a.tar.xz glibc-ee2196bb6766ca7e63a1ba22ebb7619a3266776a.zip |
Fixed wrong vector sincos/sincosf ABI to have it compatible with
current vector function declaration "#pragma omp declare simd notinbranch", according to which vector sincos should have vector of pointers for second and third parameters. It is fixed with implementation as wrapper to version having second and third parameters as pointers. [BZ #20024] * sysdeps/x86/fpu/test-math-vector-sincos.h: New. * sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core_sse4.S: Fixed ABI of this implementation of vector function. * sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core_avx2.S: Likewise. * sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S: Likewise. * sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S: Likewise. * sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core_sse4.S: Likewise. * sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core_avx2.S: Likewise. * sysdeps/x86_64/fpu/svml_d_sincos2_core.S: Likewise. * sysdeps/x86_64/fpu/svml_d_sincos4_core.S: Likewise. * sysdeps/x86_64/fpu/svml_d_sincos4_core_avx.S: Likewise. * sysdeps/x86_64/fpu/svml_d_sincos8_core.S: Likewise. * sysdeps/x86_64/fpu/svml_s_sincosf16_core.S: Likewise. * sysdeps/x86_64/fpu/svml_s_sincosf4_core.S: Likewise. * sysdeps/x86_64/fpu/svml_s_sincosf8_core.S: Likewise. * sysdeps/x86_64/fpu/svml_s_sincosf8_core_avx.S: Likewise. * sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c: Use another wrapper for testing vector sincos with fixed ABI. * sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c: Likewise. * sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c: Likewise. * sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c: Likewise. * sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c: Likewise. * sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c: Likewise. * sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c: Likewise. * sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c: Likewise. * sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx.c: New test. * sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx2.c: Likewise. * sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx512.c: Likewise. * sysdeps/x86_64/fpu/test-double-libmvec-sincos.c: Likewise. * sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx.c: Likewise. * sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx2.c: Likewise. * sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx512.c: Likewise. * sysdeps/x86_64/fpu/test-float-libmvec-sincosf.c: Likewise. * sysdeps/x86_64/fpu/Makefile: Added new tests.
Diffstat (limited to 'sysdeps/x86_64/fpu/svml_s_sincosf8_core.S')
-rw-r--r-- | sysdeps/x86_64/fpu/svml_s_sincosf8_core.S | 173 |
1 files changed, 172 insertions, 1 deletions
diff --git a/sysdeps/x86_64/fpu/svml_s_sincosf8_core.S b/sysdeps/x86_64/fpu/svml_s_sincosf8_core.S index 74d1dfd1a8..93ac91608f 100644 --- a/sysdeps/x86_64/fpu/svml_s_sincosf8_core.S +++ b/sysdeps/x86_64/fpu/svml_s_sincosf8_core.S @@ -20,8 +20,179 @@ #include "svml_s_wrapper_impl.h" .text +ENTRY (_ZGVdN8vl4l4_sincosf) +WRAPPER_IMPL_AVX_fFF _ZGVbN4vl4l4_sincosf +END (_ZGVdN8vl4l4_sincosf) +libmvec_hidden_def (_ZGVdN8vl4l4_sincosf) + +/* AVX2 ISA version as wrapper to SSE ISA version (for vector + function declared with #pragma omp declare simd notinbranch). */ +.macro WRAPPER_IMPL_AVX2_fFF_vvv callee +#ifndef __ILP32__ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-32, %rsp + subq $224, %rsp + vmovups %ymm0, 192(%rsp) + lea (%rsp), %rdi + vmovdqu %ymm1, 64(%rdi) + vmovdqu %ymm2, 96(%rdi) + vmovdqu %ymm3, 128(%rdi) + vmovdqu %ymm4, 160(%rdi) + lea 32(%rsp), %rsi + vzeroupper + call HIDDEN_JUMPTARGET(\callee) + vmovups 208(%rsp), %xmm0 + lea 16(%rsp), %rdi + lea 48(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) + movq 64(%rsp), %rdx + movq 72(%rsp), %rsi + movq 80(%rsp), %r8 + movq 88(%rsp), %r10 + movl (%rsp), %eax + movl 4(%rsp), %ecx + movl 8(%rsp), %edi + movl 12(%rsp), %r9d + movl %eax, (%rdx) + movl %ecx, (%rsi) + movq 96(%rsp), %rax + movq 104(%rsp), %rcx + movl %edi, (%r8) + movl %r9d, (%r10) + movq 112(%rsp), %rdi + movq 120(%rsp), %r9 + movl 16(%rsp), %r11d + movl 20(%rsp), %edx + movl 24(%rsp), %esi + movl 28(%rsp), %r8d + movl %r11d, (%rax) + movl %edx, (%rcx) + movq 128(%rsp), %r11 + movq 136(%rsp), %rdx + movl %esi, (%rdi) + movl %r8d, (%r9) + movq 144(%rsp), %rsi + movq 152(%rsp), %r8 + movl 32(%rsp), %r10d + movl 36(%rsp), %eax + movl 40(%rsp), %ecx + movl 44(%rsp), %edi + movl %r10d, (%r11) + movl %eax, (%rdx) + movq 160(%rsp), %r10 + movq 168(%rsp), %rax + movl %ecx, (%rsi) + movl %edi, (%r8) + movq 176(%rsp), %rcx + movq 184(%rsp), %rdi + movl 48(%rsp), %r9d + movl 52(%rsp), %r11d + movl 56(%rsp), %edx + movl 60(%rsp), %esi + movl %r9d, (%r10) + movl %r11d, (%rax) + movl %edx, (%rcx) + movl %esi, (%rdi) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +#else + leal 8(%rsp), %r10d + .cfi_def_cfa 10, 0 + andl $-32, %esp + pushq -8(%r10d) + pushq %rbp + .cfi_escape 0x10,0x6,0x2,0x76,0 + movl %esp, %ebp + pushq %r12 + leal -80(%rbp), %esi + pushq %r10 + .cfi_escape 0xf,0x3,0x76,0x70,0x6 + .cfi_escape 0x10,0xc,0x2,0x76,0x78 + leal -112(%rbp), %edi + movq %rsi, %r12 + pushq %rbx + .cfi_escape 0x10,0x3,0x2,0x76,0x68 + movq %rdi, %rbx + subl $184, %esp + vmovdqa %ymm1, -144(%ebp) + vmovdqa %ymm2, -176(%ebp) + vmovaps %ymm0, -208(%ebp) + vzeroupper + call HIDDEN_JUMPTARGET(\callee) + leal 16(%r12), %esi + vmovups -192(%ebp), %xmm0 + leal 16(%rbx), %edi + call HIDDEN_JUMPTARGET(\callee) + movl -144(%ebp), %eax + vmovss -112(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -140(%ebp), %eax + vmovss -108(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -136(%ebp), %eax + vmovss -104(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -132(%ebp), %eax + vmovss -100(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -128(%ebp), %eax + vmovss -96(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -124(%ebp), %eax + vmovss -92(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -120(%ebp), %eax + vmovss -88(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -116(%ebp), %eax + vmovss -84(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -176(%ebp), %eax + vmovss -80(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -172(%ebp), %eax + vmovss -76(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -168(%ebp), %eax + vmovss -72(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -164(%ebp), %eax + vmovss -68(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -160(%ebp), %eax + vmovss -64(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -156(%ebp), %eax + vmovss -60(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -152(%ebp), %eax + vmovss -56(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -148(%ebp), %eax + vmovss -52(%ebp), %xmm0 + vmovss %xmm0, (%eax) + addl $184, %esp + popq %rbx + popq %r10 + .cfi_def_cfa 10, 0 + popq %r12 + popq %rbp + leal -8(%r10), %esp + .cfi_def_cfa 7, 8 + ret +#endif +.endm + ENTRY (_ZGVdN8vvv_sincosf) -WRAPPER_IMPL_AVX_fFF _ZGVbN4vvv_sincosf +WRAPPER_IMPL_AVX2_fFF_vvv _ZGVbN4vl4l4_sincosf END (_ZGVdN8vvv_sincosf) #ifndef USE_MULTIARCH |