diff options
Diffstat (limited to 'sysdeps/x86_64/memchr.S')
-rw-r--r-- | sysdeps/x86_64/memchr.S | 78 |
1 files changed, 68 insertions, 10 deletions
diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S index 891ee70aef..205345b43d 100644 --- a/sysdeps/x86_64/memchr.S +++ b/sysdeps/x86_64/memchr.S @@ -20,8 +20,17 @@ /* fast SSE2 version with using pmaxub and 64 byte loop */ +# ifdef __CHKP__ +# define RETURN \ + bndcu (%rax), %bnd0; \ + ret +# else +# define RETURN ret +# endif + .text ENTRY(memchr) + movd %rsi, %xmm1 mov %rdi, %rcx @@ -33,6 +42,10 @@ ENTRY(memchr) and $63, %rcx pshufd $0, %xmm1, %xmm1 +#ifdef __CHKP__ + bndcl (%rdi), %bnd0 + bndcu (%rdi), %bnd0 +#endif cmp $48, %rcx ja L(crosscache) @@ -72,7 +85,7 @@ L(crosscache): jbe L(return_null) add %rdi, %rax add %rcx, %rax - ret + RETURN .p2align 4 L(unaligned_no_match): @@ -85,24 +98,36 @@ L(unaligned_no_match): .p2align 4 L(loop_prolog): +#ifdef __CHKP__ + bndcu (%rdi), %bnd0 +#endif movdqa (%rdi), %xmm0 pcmpeqb %xmm1, %xmm0 pmovmskb %xmm0, %eax test %eax, %eax jnz L(matches) +#ifdef __CHKP__ + bndcu 16(%rdi), %bnd0 +#endif movdqa 16(%rdi), %xmm2 pcmpeqb %xmm1, %xmm2 pmovmskb %xmm2, %eax test %eax, %eax jnz L(matches16) +#ifdef __CHKP__ + bndcu 32(%rdi), %bnd0 +#endif movdqa 32(%rdi), %xmm3 pcmpeqb %xmm1, %xmm3 pmovmskb %xmm3, %eax test %eax, %eax jnz L(matches32) +#ifdef __CHKP__ + bndcu 48(%rdi), %bnd0 +#endif movdqa 48(%rdi), %xmm4 pcmpeqb %xmm1, %xmm4 add $64, %rdi @@ -116,24 +141,36 @@ L(loop_prolog): sub $64, %rdx jbe L(exit_loop) +#ifdef __CHKP__ + bndcu (%rdi), %bnd0 +#endif movdqa (%rdi), %xmm0 pcmpeqb %xmm1, %xmm0 pmovmskb %xmm0, %eax test %eax, %eax jnz L(matches) +#ifdef __CHKP__ + bndcu 16(%rdi), %bnd0 +#endif movdqa 16(%rdi), %xmm2 pcmpeqb %xmm1, %xmm2 pmovmskb %xmm2, %eax test %eax, %eax jnz L(matches16) +#ifdef __CHKP__ + bndcu 32(%rdi), %bnd0 +#endif movdqa 32(%rdi), %xmm3 pcmpeqb %xmm1, %xmm3 pmovmskb %xmm3, %eax test %eax, %eax jnz L(matches32) +#ifdef __CHKP__ + bndcu 48(%rdi), %bnd0 +#endif movdqa 48(%rdi), %xmm3 pcmpeqb %xmm1, %xmm3 pmovmskb %xmm3, %eax @@ -151,6 +188,9 @@ L(loop_prolog): L(align64_loop): sub $64, %rdx jbe L(exit_loop) +#ifdef __CHKP__ + bndcu (%rdi), %bnd0 +#endif movdqa (%rdi), %xmm0 movdqa 16(%rdi), %xmm2 movdqa 32(%rdi), %xmm3 @@ -192,25 +232,34 @@ L(align64_loop): pmovmskb %xmm1, %eax bsf %eax, %eax lea 48(%rdi, %rax), %rax - ret + RETURN .p2align 4 L(exit_loop): add $32, %rdx jle L(exit_loop_32) +#ifdef __CHKP__ + bndcu (%rdi), %bnd0 +#endif movdqa (%rdi), %xmm0 pcmpeqb %xmm1, %xmm0 pmovmskb %xmm0, %eax test %eax, %eax jnz L(matches) +#ifdef __CHKP__ + bndcu 16(%rdi), %bnd0 +#endif movdqa 16(%rdi), %xmm2 pcmpeqb %xmm1, %xmm2 pmovmskb %xmm2, %eax test %eax, %eax jnz L(matches16) +#ifdef __CHKP__ + bndcu 32(%rdi), %bnd0 +#endif movdqa 32(%rdi), %xmm3 pcmpeqb %xmm1, %xmm3 pmovmskb %xmm3, %eax @@ -219,6 +268,9 @@ L(exit_loop): sub $16, %rdx jle L(return_null) +#ifdef __CHKP__ + bndcu 48(%rdi), %bnd0 +#endif pcmpeqb 48(%rdi), %xmm1 pmovmskb %xmm1, %eax test %eax, %eax @@ -229,6 +281,9 @@ L(exit_loop): .p2align 4 L(exit_loop_32): add $32, %rdx +#ifdef __CHKP__ + bndcu (%rdi), %bnd0 +#endif movdqa (%rdi), %xmm0 pcmpeqb %xmm1, %xmm0 pmovmskb %xmm0, %eax @@ -237,6 +292,9 @@ L(exit_loop_32): sub $16, %rdx jbe L(return_null) +#ifdef __CHKP__ + bndcu 16(%rdi), %bnd0 +#endif pcmpeqb 16(%rdi), %xmm1 pmovmskb %xmm1, %eax test %eax, %eax @@ -248,25 +306,25 @@ L(exit_loop_32): L(matches0): bsf %eax, %eax lea -16(%rax, %rdi), %rax - ret + RETURN .p2align 4 L(matches): bsf %eax, %eax add %rdi, %rax - ret + RETURN .p2align 4 L(matches16): bsf %eax, %eax lea 16(%rax, %rdi), %rax - ret + RETURN .p2align 4 L(matches32): bsf %eax, %eax lea 32(%rax, %rdi), %rax - ret + RETURN .p2align 4 L(matches_1): @@ -274,7 +332,7 @@ L(matches_1): sub %rax, %rdx jbe L(return_null) add %rdi, %rax - ret + RETURN .p2align 4 L(matches16_1): @@ -282,7 +340,7 @@ L(matches16_1): sub %rax, %rdx jbe L(return_null) lea 16(%rdi, %rax), %rax - ret + RETURN .p2align 4 L(matches32_1): @@ -290,7 +348,7 @@ L(matches32_1): sub %rax, %rdx jbe L(return_null) lea 32(%rdi, %rax), %rax - ret + RETURN .p2align 4 L(matches48_1): @@ -298,7 +356,7 @@ L(matches48_1): sub %rax, %rdx jbe L(return_null) lea 48(%rdi, %rax), %rax - ret + RETURN .p2align 4 L(return_null): |