about summary refs log tree commit diff
path: root/sysdeps/x86
diff options
context:
space:
mode:
author: H.J. Lu <hjl.tools@gmail.com>, 2021-10-29 12:40:20 -0700
committer: H.J. Lu <hjl.tools@gmail.com>, 2021-11-01 07:52:56 -0700
commit: c46e9afb2df5fc9e39ff4d13777e4b4c26e04e55 (patch)
tree: 92623deb486f7d21b4f912bd3faa04d5a7a4a9f7 /sysdeps/x86
parent: 79d0fc65395716c1d95931064c7bf37852203c66 (diff)
download: glibc-c46e9afb2df5fc9e39ff4d13777e4b4c26e04e55.tar.gz
glibc-c46e9afb2df5fc9e39ff4d13777e4b4c26e04e55.tar.xz
glibc-c46e9afb2df5fc9e39ff4d13777e4b4c26e04e55.zip
x86-64: Improve EVEX strcmp with masked load
In strcmp-evex.S, to compare 32-byte blocks from each of two strings, replace

        VMOVU   (%rdi, %rdx), %YMM0
        VMOVU   (%rsi, %rdx), %YMM1
        /* Each bit in K0 represents a mismatch in YMM0 and YMM1.  */
        VPCMP   $4, %YMM0, %YMM1, %k0
        VPCMP   $0, %YMMZERO, %YMM0, %k1
        VPCMP   $0, %YMMZERO, %YMM1, %k2
        /* Each bit in K1 represents a NULL in YMM0 or YMM1.  */
        kord    %k1, %k2, %k1
        /* Each bit in K1 represents a NULL or a mismatch.  */
        kord    %k0, %k1, %k1
        kmovd   %k1, %ecx
        testl   %ecx, %ecx
        jne     L(last_vector)

with

        VMOVU   (%rdi, %rdx), %YMM0
        VPTESTM %YMM0, %YMM0, %k2
        /* Each bit cleared in K1 represents a mismatch or a null CHAR
           in YMM0 and 32 bytes at (%rsi, %rdx).  */
        VPCMP   $0, (%rsi, %rdx), %YMM0, %k1{%k2}
        kmovd   %k1, %ecx
        incl    %ecx
        jne     L(last_vector)

This change makes EVEX strcmp faster than AVX2 strcmp by up to 40% on
Tiger Lake and Ice Lake.

Co-Authored-By: Noah Goldstein <goldstein.w.n@gmail.com>
Diffstat (limited to 'sysdeps/x86')
0 files changed, 0 insertions, 0 deletions