// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "textflag.h"

// Equal reports whether the a_len bytes at a_base and the b_len bytes at
// b_base are identical, storing the one-byte result in ret.
// NOTE(review): presumably declared in Go as func Equal(a, b []byte) bool —
// the frame layout (two 12-byte headers followed by a 1-byte result) matches
// that shape; confirm against the Go declaration.
TEXT ·Equal(SB),NOSPLIT,$0-25
	// Differing lengths can never compare equal.
	MOVL	b_len+16(FP), CX
	MOVL	a_len+4(FP), BX
	CMPL	BX, CX
	JNE	different
	// Same length and same base pointer: equal without reading memory.
	MOVL	b_base+12(FP), DI
	MOVL	a_base+0(FP), SI
	CMPL	SI, DI
	JEQ	same
	// Tail-jump into the shared loop with SI=a, DI=b, BX=count, AX=&ret.
	LEAL	ret+24(FP), AX
	JMP	memeqbody<>(SB)
same:
	MOVB	$1, ret+24(FP)
	RET
different:
	MOVB	$0, ret+24(FP)
	RET

// bytes·Equal provides the same comparison under the bytes package symbol.
// It takes the same frame layout as ·Equal and reuses that function's
// argument stack map via FUNCDATA.
TEXT bytes·Equal(SB),NOSPLIT,$0-25
	FUNCDATA $0, ·Equal·args_stackmap(SB)
	// Length mismatch means the result is false.
	MOVL	b_len+16(FP), CX
	MOVL	a_len+4(FP), BX
	CMPL	BX, CX
	JNE	len_mismatch
	// Equal lengths and identical base pointers: result is true.
	MOVL	b_base+12(FP), DI
	MOVL	a_base+0(FP), SI
	CMPL	SI, DI
	JEQ	same_ptr
	// Hand off to the shared loop with SI=a, DI=b, BX=count, AX=&ret.
	LEAL	ret+24(FP), AX
	JMP	memeqbody<>(SB)
same_ptr:
	MOVB	$1, ret+24(FP)
	RET
len_mismatch:
	MOVB	$0, ret+24(FP)
	RET

// memequal(a, b unsafe.Pointer, size uintptr) bool
// Reports whether the size bytes starting at a match the size bytes
// starting at b. Identical pointers yield true without reading memory.
TEXT runtime·memequal(SB),NOSPLIT,$0-13
	MOVL	b+4(FP), DI
	MOVL	a+0(FP), SI
	CMPL	SI, DI
	JEQ	same_ptr
	// Tail-jump into the shared loop with SI=a, DI=b, BX=size, AX=&ret.
	LEAL	ret+12(FP), AX
	MOVL	size+8(FP), BX
	JMP	memeqbody<>(SB)
same_ptr:
	MOVB	$1, ret+12(FP)
	RET

// memequal_varlen(a, b unsafe.Pointer) bool
// Like memequal, except the byte count is not an argument: it is read from
// the closure that DX points to. Identical pointers yield true without
// reading memory.
TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-9
	MOVL	b+4(FP), DI
	MOVL	a+0(FP), SI
	CMPL	SI, DI
	JEQ	same_ptr
	// Tail-jump into the shared loop with SI=a, DI=b, BX=size, AX=&ret.
	LEAL	ret+8(FP), AX
	MOVL	4(DX), BX	// compiler stores size at offset 4 in the closure
	JMP	memeqbody<>(SB)
same_ptr:
	MOVB	$1, ret+8(FP)
	RET

// memeqbody is the shared comparison loop reached by tail-jump from the
// entry points in this file. It writes 1 to the result byte if the two
// buffers match over count bytes and 0 otherwise, then returns.
//
// a in SI
// b in DI
// count in BX
// address of result byte in AX
//
// CX and DX are used as scratch; X0-X7 are used when the SSE2 path runs.
TEXT memeqbody<>(SB),NOSPLIT,$0-0
	// Counts below 4 cannot use a full 32-bit load safely; handle separately.
	CMPL	BX, $4
	JB	small

	// 64 bytes at a time using xmm registers
hugeloop:
	CMPL	BX, $64
	JB	bigloop
	// Only take the SSE2 path when the CPU feature flag is set.
	CMPB	internal∕cpu·X86+const_x86_HasSSE2(SB), $1
	JNE	bigloop
	// Unaligned 16-byte loads: four chunks from each buffer.
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	MOVOU	16(SI), X2
	MOVOU	16(DI), X3
	MOVOU	32(SI), X4
	MOVOU	32(DI), X5
	MOVOU	48(SI), X6
	MOVOU	48(DI), X7
	// Bytewise compare: each result byte is 0xff where equal, 0x00 otherwise.
	PCMPEQB	X1, X0
	PCMPEQB	X3, X2
	PCMPEQB	X5, X4
	PCMPEQB	X7, X6
	// AND the four masks together so any mismatch clears a byte in X0.
	PAND	X2, X0
	PAND	X6, X4
	PAND	X4, X0
	// Collect the top bit of each of the 16 bytes into the low 16 bits of DX.
	PMOVMSKB X0, DX
	// Advance before testing; the adjusted values are only used if we loop.
	ADDL	$64, SI
	ADDL	$64, DI
	SUBL	$64, BX
	// 0xffff means all 16 mask bytes were set, i.e. all 64 bytes matched.
	CMPL	DX, $0xffff
	JEQ	hugeloop
	MOVB	$0, (AX)
	RET

	// 4 bytes at a time using 32-bit register
bigloop:
	// Stop once 4 or fewer bytes remain; the tail is handled by leftover.
	CMPL	BX, $4
	JBE	leftover
	MOVL	(SI), CX
	MOVL	(DI), DX
	ADDL	$4, SI
	ADDL	$4, DI
	SUBL	$4, BX
	CMPL	CX, DX
	JEQ	bigloop
	MOVB	$0, (AX)
	RET

	// remaining 0-4 bytes
leftover:
	// Compare the final 4 bytes of each buffer (those ending at SI+BX and
	// DI+BX). When fewer than 4 bytes remain this re-reads bytes that already
	// compared equal; that is in bounds because this path is only reached
	// when the original count was at least 4.
	MOVL	-4(SI)(BX*1), CX
	MOVL	-4(DI)(BX*1), DX
	CMPL	CX, DX
	SETEQ	(AX)
	RET

small:
	// count is 0..3 here. For count == 0 the CMPL sets ZF, so the SETEQ at
	// "equal" stores 1.
	CMPL	BX, $0
	JEQ	equal

	// CX = -8*count. The shifts below use only the low 5 bits of CL, so the
	// effective shift amount is 32 - 8*count bits.
	LEAL	0(BX*8), CX
	NEGL	CX

	// Test the low byte of the address: above 0xfc, a 4-byte load starting
	// at SI could run past the end of the page.
	MOVL	SI, DX
	CMPB	DX, $0xfc
	JA	si_high

	// load at SI won't cross a page boundary.
	// The bytes beyond count in the loaded word are garbage; they are
	// discarded by the SHLL below.
	MOVL	(SI), SI
	JMP	si_finish
si_high:
	// address ends in 111111xx. Load up to bytes we want, move to correct position.
	// The load ends exactly at SI+count, so the wanted bytes land in the high
	// end of the register; SHRL moves them down to the low end.
	MOVL	-4(SI)(BX*1), SI
	SHRL	CX, SI
si_finish:

	// same for DI.
	MOVL	DI, DX
	CMPB	DX, $0xfc
	JA	di_high
	MOVL	(DI), DI
	JMP	di_finish
di_high:
	MOVL	-4(DI)(BX*1), DI
	SHRL	CX, DI
di_finish:

	// The low 8*count bits of the difference depend only on the wanted bytes.
	// Shifting left by 32 - 8*count discards everything above them, so ZF is
	// set exactly when the first count bytes of both buffers match.
	SUBL	SI, DI
	SHLL	CX, DI
equal:
	// Reached with ZF from either the CMPL BX, $0 above or the SHLL.
	SETEQ	(AX)
	RET
