/* memcmp with SSE2.
   Copyright (C) 2017-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

/* MINIMUM_X86_ISA_LEVEL <= 2 because there is no V2 implementation
   so we need this to build for ISA V2 builds.  */
#if ISA_SHOULD_BUILD (2)

# include <sysdep.h>

# ifndef MEMCMP
#  define MEMCMP	__memcmp_sse2
# endif

# ifdef USE_AS_WMEMCMP
#  define PCMPEQ	pcmpeqd
#  define CHAR_SIZE	4
#  define SIZE_OFFSET	(0)
# else
#  define PCMPEQ	pcmpeqb
#  define CHAR_SIZE	1
# endif

# ifdef USE_AS_MEMCMPEQ
#  define SIZE_OFFSET	(0)
#  define CHECK_CMP(x, y)	subl x, y
# else
#  ifndef SIZE_OFFSET
#   define SIZE_OFFSET	(CHAR_PER_VEC * 2)
#  endif
#  define CHECK_CMP(x, y)	cmpl x, y
# endif

# define VEC_SIZE	16
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)

# ifndef MEMCMP
#  define MEMCMP	memcmp
# endif

	.text
ENTRY(MEMCMP)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
# ifdef USE_AS_WMEMCMP
	/* Use 0xffff to test for mismatches on the pmovmskb bitmask.
	   Store in ecx for code size.  This is preferable to using
	   `incw` as it avoids partial register stalls on older hardware
	   (pre SnB).  */
	movl	$0xffff, %ecx
# endif
	cmpq	$CHAR_PER_VEC, %rdx
	ja	L(more_1x_vec)

# ifdef USE_AS_WMEMCMP
	/* Saves a byte of code by keeping the fall-through path
	   n = [2, 4] in the initial cache line.  */
	decl	%edx
	jle	L(cmp_0_1)

	movq	(%rsi), %xmm0
	movq	(%rdi), %xmm1
	PCMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	subl	%ecx, %eax
	jnz	L(ret_nonzero_vec_start_0)

	movq	-4(%rsi, %rdx, CHAR_SIZE), %xmm0
	movq	-4(%rdi, %rdx, CHAR_SIZE), %xmm1
	PCMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	subl	%ecx, %eax
	jnz	L(ret_nonzero_vec_end_0_adj)
# else
	cmpl	$8, %edx
	ja	L(cmp_9_16)

	cmpl	$4, %edx
	jb	L(cmp_0_3)

#  ifdef USE_AS_MEMCMPEQ
	movl	(%rsi), %eax
	subl	(%rdi), %eax

	movl	-4(%rsi, %rdx), %esi
	subl	-4(%rdi, %rdx), %esi

	orl	%esi, %eax
	ret
#  else
	/* Combine comparisons for lo and hi 4-byte comparisons.  */
	movl	-4(%rsi, %rdx), %ecx
	movl	-4(%rdi, %rdx), %eax
	shlq	$32, %rcx
	shlq	$32, %rax
	movl	(%rsi), %esi
	movl	(%rdi), %edi
	orq	%rsi, %rcx
	orq	%rdi, %rax
	/* Only compute proper return if not-equal.  */
	cmpq	%rcx, %rax
	jnz	L(ret_nonzero)
	xorl	%eax, %eax
	ret
#  endif

	.p2align 4,, 10
L(cmp_9_16):
#  ifdef USE_AS_MEMCMPEQ
	movq	(%rsi), %rax
	subq	(%rdi), %rax

	movq	-8(%rsi, %rdx), %rcx
	subq	-8(%rdi, %rdx), %rcx
	orq	%rcx, %rax
	/* Convert 64-bit -> 32-bit boolean (we should have made the ABI
	   return long).  */
	setnz	%cl
	movzbl	%cl, %eax
#  else
	movq	(%rsi), %rcx
	movq	(%rdi), %rax
	/* Only compute proper return if not-equal.  */
	cmpq	%rcx, %rax
	jnz	L(ret_nonzero)

	movq	-8(%rsi, %rdx, CHAR_SIZE), %rcx
	movq	-8(%rdi, %rdx, CHAR_SIZE), %rax
	/* Only compute proper return if not-equal.  */
	cmpq	%rcx, %rax
	jnz	L(ret_nonzero)
	xorl	%eax, %eax
#  endif
# endif
	ret
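
/* Illustration only (kept inside a comment, not assembled): a rough C
   model of the [4, 8]-byte path above, which combines the first and
   last (possibly overlapping) 4-byte loads into one 64-bit compare;
   L(ret_nonzero) below supplies the sign via byte-swapping so integer
   order matches memory order on little-endian x86.  The function and
   variable names here are hypothetical and assume <stdint.h> and
   <string.h>.

	static int
	cmp_4_to_8 (const unsigned char *s1, const unsigned char *s2,
		    size_t n)
	{
	  uint32_t a_lo, a_hi, b_lo, b_hi;
	  memcpy (&a_lo, s1, 4);          // bytes [0, 4)
	  memcpy (&a_hi, s1 + n - 4, 4);  // bytes [n - 4, n), may overlap
	  memcpy (&b_lo, s2, 4);
	  memcpy (&b_hi, s2 + n - 4, 4);
	  uint64_t a = ((uint64_t) a_hi << 32) | a_lo;
	  uint64_t b = ((uint64_t) b_hi << 32) | b_lo;
	  if (a == b)
	    return 0;
	  // As in L(ret_nonzero): after bswap the first differing byte
	  // decides the sign.
	  a = __builtin_bswap64 (a);
	  b = __builtin_bswap64 (b);
	  return a > b ? 1 : -1;
	}
 */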
	.p2align 4,, 8
L(cmp_0_1):
	/* Flag set by earlier comparison against 1.  */
	jne	L(cmp_0_0)
# ifdef USE_AS_WMEMCMP
	movl	(%rdi), %ecx
	xorl	%edx, %edx
	cmpl	(%rsi), %ecx
	je	L(cmp_0_0)
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	(%rdi), %eax
	movzbl	(%rsi), %ecx
	subl	%ecx, %eax
# endif
	ret

	/* Fits in aligning bytes.  */
L(cmp_0_0):
	xorl	%eax, %eax
	ret

# ifdef USE_AS_WMEMCMP
	.p2align 4
L(ret_nonzero_vec_start_0):
	bsfl	%eax, %eax
	movl	(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
	ret
# else

#  ifndef USE_AS_MEMCMPEQ
	.p2align 4,, 14
L(ret_nonzero):
	/* Need to bswap to get proper return without branch.  */
	bswapq	%rcx
	bswapq	%rax
	subq	%rcx, %rax
	sbbl	%eax, %eax
	orl	$1, %eax
	ret
#  endif

	.p2align 4
L(cmp_0_3):
#  ifdef USE_AS_MEMCMPEQ
	/* No reason to add to the dependency chain on rdx.  Saving the
	   bytes here doesn't change the number of fetch blocks.  */
	cmpl	$1, %edx
	jbe	L(cmp_0_1)
#  else
	/* We need the code size to prevent taking an extra fetch
	   block.  */
	decl	%edx
	jle	L(cmp_0_1)
#  endif
	movzwl	(%rsi), %ecx
	movzwl	(%rdi), %eax
#  ifdef USE_AS_MEMCMPEQ
	subl	%ecx, %eax

	movzbl	-1(%rsi, %rdx), %esi
	movzbl	-1(%rdi, %rdx), %edi
	subl	%edi, %esi
	orl	%esi, %eax
#  else
	bswapl	%ecx
	bswapl	%eax

	/* Implicit right shift by one.  We just need to displace the
	   sign bits.  */
	shrl	%ecx
	shrl	%eax

	/* Eat a partial register stall here.  Saves code stopping
	   L(cmp_0_3) from bleeding into the next fetch block and saves
	   an ALU op.  */
	movb	(%rsi, %rdx), %cl
	movzbl	(%rdi, %rdx), %edi
	orl	%edi, %eax
	subl	%ecx, %eax
#  endif
	ret
# endif

	.p2align 5
L(more_1x_vec):
# ifndef USE_AS_WMEMCMP
	/* Use 0xffff to test for mismatches on the pmovmskb bitmask.
	   Store in ecx for code size.  This is preferable to using
	   `incw` as it avoids partial register stalls on older hardware
	   (pre SnB).  */
	movl	$0xffff, %ecx
# endif
	movups	(%rsi), %xmm0
	movups	(%rdi), %xmm1
	PCMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	subl	%ecx, %eax
	jnz	L(ret_nonzero_vec_start_0)
# if SIZE_OFFSET == 0
	cmpq	$(CHAR_PER_VEC * 2), %rdx
# else
	/* Offset rdx.  Saves just enough code size to keep the
	   L(last_2x_vec) case and the non-zero return in a single cache
	   line.  */
	subq	$(CHAR_PER_VEC * 2), %rdx
# endif
	ja	L(more_2x_vec)

	movups	(VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
	movups	(VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
	PCMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	subl	%ecx, %eax
# ifndef USE_AS_MEMCMPEQ
	/* Don't use `incw ax` as machines this code runs on are liable
	   to have partial register stalls.  */
	jnz	L(ret_nonzero_vec_end_0)
# else
	/* Various return targets for memcmpeq.  Will always be hot in
	   Icache and get short encoding.  */
L(ret_nonzero_vec_start_1):
L(ret_nonzero_vec_start_0):
L(ret_nonzero_vec_end_0):
# endif
	ret
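
/* Worked example of the SIZE_OFFSET bias above (illustration only,
   assuming the plain memcmp build: CHAR_SIZE == 1 and SIZE_OFFSET ==
   CHAR_PER_VEC * 2 == 32): for n == 40 the `subq` leaves rdx == 8, so
   (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE) evaluates to
   rdi + 8 + 16 == rdi + (n - 16), i.e. the final 16 bytes.  With
   SIZE_OFFSET == 0 (wmemcmp/memcmpeq) rdx keeps the full char count
   and the same expression reduces to rdi + n * CHAR_SIZE - VEC_SIZE.  */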
# ifndef USE_AS_MEMCMPEQ
#  ifdef USE_AS_WMEMCMP
	.p2align 4
L(ret_nonzero_vec_end_0_adj):
	addl	$3, %edx
#  else
	.p2align 4,, 8
#  endif
L(ret_nonzero_vec_end_0):
	bsfl	%eax, %eax
#  ifdef USE_AS_WMEMCMP
	leal	(%rax, %rdx, CHAR_SIZE), %eax
	movl	(VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
#  else
	/* Use `addq` instead of `addl` here so that even if `rax` +
	   `rdx` is negative, the value of the sum will be usable as a
	   64-bit offset (negative 32-bit numbers zero-extend to large
	   and often out-of-bounds 64-bit offsets).  Note that `rax` +
	   `rdx` >= 0 is an invariant when `memcmp` is used correctly,
	   but if the input strings `rsi`/`rdi` are concurrently
	   modified as the function runs (i.e. there is a data race) it
	   is possible for `rax` + `rdx` to be negative.  Given that
	   there is virtually no extra cost to using `addq` instead of
	   `addl` we may as well protect the data-race case.  */
	addq	%rdx, %rax
	movzbl	(VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %eax
	subl	%ecx, %eax
#  endif
	ret

#  ifndef USE_AS_WMEMCMP
	.p2align 4,, 10
L(ret_nonzero_vec_start_0):
	bsfl	%eax, %eax
	movzbl	(%rsi, %rax), %ecx
	movzbl	(%rdi, %rax), %eax
	subl	%ecx, %eax
	ret
#  endif
# else
# endif

	.p2align 5
L(more_2x_vec):
	movups	(VEC_SIZE * 1)(%rsi), %xmm0
	movups	(VEC_SIZE * 1)(%rdi), %xmm1
	PCMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	subl	%ecx, %eax
	jnz	L(ret_nonzero_vec_start_1)

	cmpq	$(CHAR_PER_VEC * 4 - SIZE_OFFSET), %rdx
	jbe	L(last_2x_vec)

	cmpq	$(CHAR_PER_VEC * 8 - SIZE_OFFSET), %rdx
	ja	L(more_8x_vec)

	/* Do comparisons for [65, 96] and [97, 128] 2x VEC at a time.
	   This can harm performance if the non-zero return is in
	   [65, 80] or [97, 112] but helps performance otherwise.
	   Generally zero-return is hotter.  */
	movups	(VEC_SIZE * 2)(%rsi), %xmm0
	movups	(VEC_SIZE * 2)(%rdi), %xmm1
	PCMPEQ	%xmm0, %xmm1
	movups	(VEC_SIZE * 3)(%rsi), %xmm2
	movups	(VEC_SIZE * 3)(%rdi), %xmm3
	PCMPEQ	%xmm2, %xmm3

	pand	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	CHECK_CMP (%ecx, %eax)
	jnz	L(ret_nonzero_vec_start_2_3)

	cmpl	$(CHAR_PER_VEC * 6 - SIZE_OFFSET), %edx
	jbe	L(last_2x_vec)

	movups	(VEC_SIZE * 4)(%rsi), %xmm0
	movups	(VEC_SIZE * 4)(%rdi), %xmm1
	PCMPEQ	%xmm0, %xmm1
	movups	(VEC_SIZE * 5)(%rsi), %xmm2
	movups	(VEC_SIZE * 5)(%rdi), %xmm3
	PCMPEQ	%xmm2, %xmm3

	pand	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	CHECK_CMP (%ecx, %eax)
# ifdef USE_AS_MEMCMPEQ
	jz	L(last_2x_vec)
	ret
# else
	jnz	L(ret_nonzero_vec_start_4_5)
# endif
	.p2align 4
L(last_2x_vec):
	movups	(VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
	movups	(VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
	PCMPEQ	%xmm0, %xmm1
	movups	(VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm2
	movups	(VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm3
	PCMPEQ	%xmm2, %xmm3
	pand	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	subl	%ecx, %eax
# ifdef USE_AS_MEMCMPEQ
	/* Various return targets for memcmpeq.  Will always be hot in
	   Icache and get short encoding.  */
L(ret_nonzero_vec_start_2_3):
L(ret_nonzero_vec_start_4_5):
	ret
# else
	jnz	L(ret_nonzero_vec_end_1)
	ret

	.p2align 4,, 8
L(ret_nonzero_vec_end_1):
	pmovmskb %xmm1, %ecx
	/* High 16 bits of eax guaranteed to be all ones.  Rotate them
	   in so we can do `or + not` with just `xor`.  */
	rorl	$16, %eax
	xorl	%ecx, %eax
	/* Partial register stall.  */

	bsfl	%eax, %eax
#  ifdef USE_AS_WMEMCMP
	leal	(%rax, %rdx, CHAR_SIZE), %eax
	movl	(VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
#  else
	addl	%edx, %eax
	movzbl	(VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %eax
	subl	%ecx, %eax
#  endif
	ret
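
/* Informal sketch of the combine in L(ret_nonzero_vec_end_1) above: on
   entry eax == combined_mask - 0xffff, so its upper 16 bits are all
   ones and its lower 16 bits are combined_mask + 1.

	rorl $16, %eax    high half: combined_mask + 1, low half: 0xffff
	xorl %ecx, %eax   low half becomes ~mask_of_VEC(-2)
	bsfl %eax, %eax   index in [0, 15]:  first mismatch in VEC -2
	                  index in [16, 31]: first mismatch in VEC -1

   The rotated-in 0xffff lets the `xor` act as a `not` on the first
   vector's mask, and the lowest set bit of combined_mask + 1 is the
   lowest clear bit of combined_mask, so a single bsf covers both
   vectors.  L(ret_nonzero_loop) below uses the same idea widened to 64
   bits.  */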
	.p2align 4
L(ret_nonzero_vec_start_4_5):
	pmovmskb %xmm1, %edx
	sall	$16, %eax
	leal	1(%rax, %rdx), %eax

	bsfl	%eax, %eax
#  ifdef USE_AS_WMEMCMP
	movl	(VEC_SIZE * 4)(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 4)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
#  else
	movzbl	(VEC_SIZE * 4)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * 4)(%rdi, %rax), %eax
	subl	%ecx, %eax
#  endif
	ret

	.p2align 4,, 8
L(ret_nonzero_vec_start_1):
	bsfl	%eax, %eax
#  ifdef USE_AS_WMEMCMP
	movl	(VEC_SIZE * 1)(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 1)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
#  else
	movzbl	(VEC_SIZE * 1)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * 1)(%rdi, %rax), %eax
	subl	%ecx, %eax
#  endif
	ret
# endif

	.p2align 4
L(more_8x_vec):
	subq	%rdi, %rsi
	leaq	(VEC_SIZE * -6 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %rdx
	andq	$(VEC_SIZE * -1), %rdi
	addq	%rdi, %rsi
	.p2align 4
L(loop_4x):
	movups	(VEC_SIZE * 2)(%rsi), %xmm0
	movups	(VEC_SIZE * 3)(%rsi), %xmm1

	PCMPEQ	(VEC_SIZE * 2)(%rdi), %xmm0
	PCMPEQ	(VEC_SIZE * 3)(%rdi), %xmm1

	movups	(VEC_SIZE * 4)(%rsi), %xmm2
	movups	(VEC_SIZE * 5)(%rsi), %xmm3

	PCMPEQ	(VEC_SIZE * 4)(%rdi), %xmm2
	PCMPEQ	(VEC_SIZE * 5)(%rdi), %xmm3

	pand	%xmm0, %xmm1
	pand	%xmm2, %xmm3
	pand	%xmm1, %xmm3

	pmovmskb %xmm3, %eax
	subl	%ecx, %eax
	jnz	L(ret_nonzero_loop)

	addq	$(VEC_SIZE * 4), %rdi
	addq	$(VEC_SIZE * 4), %rsi
	cmpq	%rdi, %rdx
	ja	L(loop_4x)

	/* Get remaining length in edx.  */
	subl	%edi, %edx
	/* Restore offset so we can reuse L(last_2x_vec).  */
	addl	$(VEC_SIZE * 6 - SIZE_OFFSET), %edx
# ifdef USE_AS_WMEMCMP
	shrl	$2, %edx
# endif
	cmpl	$(CHAR_PER_VEC * 4 - SIZE_OFFSET), %edx
	jbe	L(last_2x_vec)

	movups	(VEC_SIZE * 2)(%rsi), %xmm0
	movups	(VEC_SIZE * 2)(%rdi), %xmm1
	PCMPEQ	%xmm0, %xmm1
	movups	(VEC_SIZE * 3)(%rsi), %xmm2
	movups	(VEC_SIZE * 3)(%rdi), %xmm3
	PCMPEQ	%xmm2, %xmm3

	pand	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	CHECK_CMP (%ecx, %eax)
	jz	L(last_2x_vec)
# ifdef USE_AS_MEMCMPEQ
L(ret_nonzero_loop):
	ret
# else

	.p2align 4
L(ret_nonzero_vec_start_2_3):
	pmovmskb %xmm1, %edx
	sall	$16, %eax
	leal	1(%rax, %rdx), %eax

	bsfl	%eax, %eax
#  ifdef USE_AS_WMEMCMP
	movl	(VEC_SIZE * 2)(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
#  else
	movzbl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * 2)(%rdi, %rax), %eax
	subl	%ecx, %eax
#  endif
	ret

	.p2align 4
L(ret_nonzero_loop):
	pmovmskb %xmm0, %ecx
	pmovmskb %xmm1, %edx
	sall	$(VEC_SIZE * 1), %edx
	leal	1(%rcx, %rdx), %edx
	pmovmskb %xmm2, %ecx
	/* High 16 bits of eax guaranteed to be all ones.  Rotate them
	   in so we can do `or + not` with just `xor`.  */
	rorl	$16, %eax
	xorl	%ecx, %eax

	salq	$32, %rax
	orq	%rdx, %rax

	bsfq	%rax, %rax
#  ifdef USE_AS_WMEMCMP
	movl	(VEC_SIZE * 2)(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
#  else
	movzbl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * 2)(%rdi, %rax), %eax
	subl	%ecx, %eax
#  endif
	ret
# endif
END(MEMCMP)
#endif
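
/* Closing note on the non-zero return paths above (illustration only,
   not assembled): L(ret_nonzero_vec_start_2_3) and
   L(ret_nonzero_vec_start_4_5) use `sall $16; leal 1(%rax, %rdx)` to
   merge two pmovmskb equality masks so a single bsf finds the first
   mismatching byte across both vectors.  A rough C model of the
   2-vector case, with hypothetical names:

	unsigned combined = mask_lo & mask_hi;            // pand + pmovmskb
	unsigned merged = (combined << 16) + mask_lo + 1; // sall + leal
	// If mask_lo has a clear bit, the +1 carry stops at that bit, so
	// __builtin_ctz (merged) < 16 is the first mismatch in the low
	// vector.  If mask_lo == 0xffff, the carry runs into the high
	// half and __builtin_ctz (merged) - 16 is the first mismatch in
	// the high vector.
	unsigned idx = __builtin_ctz (merged);

   L(ret_nonzero_loop) extends the same idea to the four loop vectors
   by building a 64-bit mask and using bsfq.  */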