diff options
author | Andrew Senkevich <andrew.senkevich@intel.com> | 2015-07-24 14:47:23 +0300 |
---|---|---|
committer | Andrew Senkevich <andrew.senkevich@intel.com> | 2015-07-24 14:47:23 +0300 |
commit | 99017161354321845d11dce4fcd3abfebc5dd0d5 (patch) | |
tree | 50c62fe44aef915a84b1eb5fb0ad787e39f5a210 /sysdeps/x86_64/fpu/svml_s_wrapper_impl.h | |
parent | 3bcea719ddd6ce399d7bccb492c40af77d216e42 (diff) | |
download | glibc-99017161354321845d11dce4fcd3abfebc5dd0d5.tar.gz glibc-99017161354321845d11dce4fcd3abfebc5dd0d5.tar.xz glibc-99017161354321845d11dce4fcd3abfebc5dd0d5.zip |
Fixed several libmvec bugs found during testing on KNL hardware.
AVX512 IFUNC implementations, implementations of wrappers to AVX2 versions and KNL expf implementation fixed. * sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S: Fixed AVX512 IFUNC. * sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S: Likewise. * sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S: Likewise. * sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S: Likewise. * sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S: Likewise. * sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S: Likewise. * sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S: Likewise. * sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S: Likewise. * sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S: Likewise. * sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S: Likewise. * sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S: Likewise. * sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S: Likewise. * sysdeps/x86_64/fpu/svml_d_wrapper_impl.h: Fixed wrappers to AVX2. * sysdeps/x86_64/fpu/svml_s_wrapper_impl.h: Likewise. * sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S: Fixed KNL implementation.
Diffstat (limited to 'sysdeps/x86_64/fpu/svml_s_wrapper_impl.h')
-rw-r--r-- | sysdeps/x86_64/fpu/svml_s_wrapper_impl.h | 101 |
1 files changed, 62 insertions, 39 deletions
diff --git a/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h b/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h index 66bb081c9d..d255d195ee 100644 --- a/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h +++ b/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h @@ -239,28 +239,39 @@ /* AVX512 ISA version as wrapper to AVX2 ISA version. */ .macro WRAPPER_IMPL_AVX512 callee - pushq %rbp + pushq %rbp cfi_adjust_cfa_offset (8) cfi_rel_offset (%rbp, 0) - movq %rsp, %rbp + movq %rsp, %rbp cfi_def_cfa_register (%rbp) - andq $-64, %rsp - subq $64, %rsp -/* Below is encoding for vmovaps %zmm0, (%rsp). */ - .byte 0x62 - .byte 0xf1 - .byte 0x7c - .byte 0x48 - .byte 0x29 - .byte 0x04 - .byte 0x24 - vmovaps (%rsp), %ymm0 - call HIDDEN_JUMPTARGET(\callee) - vmovaps 32(%rsp), %ymm0 - call HIDDEN_JUMPTARGET(\callee) - movq %rbp, %rsp + andq $-64, %rsp + subq $128, %rsp +/* Below is encoding for vmovups %zmm0, (%rsp). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x04 + .byte 0x24 + vmovupd (%rsp), %ymm0 + call HIDDEN_JUMPTARGET(\callee) + vmovupd %ymm0, 64(%rsp) + vmovupd 32(%rsp), %ymm0 + call HIDDEN_JUMPTARGET(\callee) + vmovupd %ymm0, 96(%rsp) +/* Below is encoding for vmovups 64(%rsp), %zmm0. */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x10 + .byte 0x44 + .byte 0x24 + .byte 0x01 + movq %rbp, %rsp cfi_def_cfa_register (%rsp) - popq %rbp + popq %rbp cfi_adjust_cfa_offset (-8) cfi_restore (%rbp) ret @@ -274,29 +285,41 @@ movq %rsp, %rbp cfi_def_cfa_register (%rbp) andq $-64, %rsp - subq $128, %rsp -/* Below is encoding for vmovaps %zmm0, (%rsp). */ - .byte 0x62 - .byte 0xf1 - .byte 0x7c - .byte 0x48 - .byte 0x29 - .byte 0x04 - .byte 0x24 -/* Below is encoding for vmovaps %zmm1, 64(%rsp). */ - .byte 0x62 - .byte 0xf1 - .byte 0x7c - .byte 0x48 - .byte 0x29 - .byte 0x4c - .byte 0x24 - vmovaps (%rsp), %ymm0 - vmovaps 64(%rsp), %ymm1 + subq $192, %rsp +/* Below is encoding for vmovups %zmm0, (%rsp). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x04 + .byte 0x24 +/* Below is encoding for vmovups %zmm1, 64(%rsp). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x11 + .byte 0x4c + .byte 0x24 + .byte 0x01 + vmovups (%rsp), %ymm0 + vmovups 64(%rsp), %ymm1 call HIDDEN_JUMPTARGET(\callee) - vmovaps 32(%rsp), %ymm0 - vmovaps 96(%rsp), %ymm1 + vmovups %ymm0, 128(%rsp) + vmovups 32(%rsp), %ymm0 + vmovups 96(%rsp), %ymm1 call HIDDEN_JUMPTARGET(\callee) + vmovups %ymm0, 160(%rsp) +/* Below is encoding for vmovups 128(%rsp), %zmm0. */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x10 + .byte 0x44 + .byte 0x24 + .byte 0x02 movq %rbp, %rsp cfi_def_cfa_register (%rsp) popq %rbp |