github.com/sandwichdev/go-internals@v0.0.0-20210605002614-12311ac6b2c5/bytealg/equal_amd64.s (about)

     1  // Copyright 2018 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "textflag.h"
     7  
     8  // memequal(a, b unsafe.Pointer, size uintptr) bool reports whether the size bytes at a and at b are identical.
     9  TEXT runtime·memequal(SB),NOSPLIT,$0-25
    10  	MOVQ	a+0(FP), SI
    11  	MOVQ	b+8(FP), DI
    12  	CMPQ	SI, DI
    13  	JNE	compare	// different addresses: the contents have to be inspected
    14  	MOVB	$1, ret+24(FP)	// same address: equal without touching memory
    15  	RET
    16  compare:
    17  	MOVQ	size+16(FP), BX	// byte count, as memeqbody expects it
    18  	LEAQ	ret+24(FP), AX	// memeqbody stores the answer through AX
    19  	JMP	memeqbody<>(SB)	// tail jump: memeqbody's RET returns to our caller
    20  
    21  // memequal_varlen(a, b unsafe.Pointer) bool is memequal with the byte count taken from the closure in DX instead of an argument.
    22  TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17
    23  	MOVQ	a+0(FP), SI
    24  	MOVQ	b+8(FP), DI
    25  	CMPQ	SI, DI
    26  	JNE	compare	// different addresses: the contents have to be inspected
    27  	MOVB	$1, ret+16(FP)	// same address: equal without touching memory
    28  	RET
    29  compare:
    30  	MOVQ	8(DX), BX    // compiler stores size at offset 8 in the closure
    31  	LEAQ	ret+16(FP), AX	// memeqbody stores the answer through AX
    32  	JMP	memeqbody<>(SB)	// tail jump: memeqbody's RET returns to our caller
    33  
    34  // memeqbody stores 1 in the result byte if the count bytes at a and b are equal, 0 otherwise. a in SI
    35  // b in DI (SI and DI are both overwritten)
    36  // count in BX (overwritten; CX, DX and X0-X7 or Y0-Y6 are used as scratch depending on the path)
    37  // address of result byte in AX (never written by this routine)
    38  TEXT memeqbody<>(SB),NOSPLIT,$0-0
    39  	CMPQ	BX, $8
    40  	JB	small	// fewer than 8 bytes: one masked 8-byte comparison
    41  	CMPQ	BX, $64
    42  	JB	bigloop	// 8..63 bytes: skip the vector loops
    43  	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
    44  	JE	hugeloop_avx2	// HasAVX2 byte is 1: use the ymm loop
    45  
    46  	// 64 bytes at a time using xmm registers
    47  hugeloop:
    48  	CMPQ	BX, $64
    49  	JB	bigloop	// under 64 bytes left: finish 8 bytes at a time
    50  	MOVOU	(SI), X0
    51  	MOVOU	(DI), X1
    52  	MOVOU	16(SI), X2
    53  	MOVOU	16(DI), X3
    54  	MOVOU	32(SI), X4
    55  	MOVOU	32(DI), X5
    56  	MOVOU	48(SI), X6
    57  	MOVOU	48(DI), X7
    58  	PCMPEQB	X1, X0	// each byte becomes 0xff where a and b agree, 0x00 where they differ
    59  	PCMPEQB	X3, X2
    60  	PCMPEQB	X5, X4
    61  	PCMPEQB	X7, X6
    62  	PAND	X2, X0	// fold the four results: a byte stays 0xff only if it matched in every 16-byte group
    63  	PAND	X6, X4
    64  	PAND	X4, X0
    65  	PMOVMSKB X0, DX	// DX = 16-bit mask, one bit per byte of X0
    66  	ADDQ	$64, SI
    67  	ADDQ	$64, DI
    68  	SUBQ	$64, BX
    69  	CMPL	DX, $0xffff	// all 16 bits set <=> all 64 bytes matched
    70  	JEQ	hugeloop
    71  	MOVB	$0, (AX)	// mismatch somewhere in this 64-byte chunk
    72  	RET
    73  
    74  	// 64 bytes at a time using ymm registers
    75  hugeloop_avx2:
    76  	CMPQ	BX, $64
    77  	JB	bigloop_avx2	// under 64 bytes left: leave the AVX2 loop
    78  	VMOVDQU	(SI), Y0
    79  	VMOVDQU	(DI), Y1
    80  	VMOVDQU	32(SI), Y2
    81  	VMOVDQU	32(DI), Y3
    82  	VPCMPEQB	Y1, Y0, Y4	// Y4: bytewise equality of the first 32 bytes
    83  	VPCMPEQB	Y2, Y3, Y5	// Y5: bytewise equality of the second 32 bytes
    84  	VPAND	Y4, Y5, Y6
    85  	VPMOVMSKB Y6, DX	// DX = 32-bit mask, one bit per byte of Y6
    86  	ADDQ	$64, SI
    87  	ADDQ	$64, DI
    88  	SUBQ	$64, BX
    89  	CMPL	DX, $0xffffffff	// all 32 bits set <=> all 64 bytes matched
    90  	JEQ	hugeloop_avx2
    91  	VZEROUPPER	// clear the upper ymm halves before leaving the AVX2 path
    92  	MOVB	$0, (AX)	// mismatch somewhere in this 64-byte chunk
    93  	RET
    94  
    95  bigloop_avx2:
    96  	VZEROUPPER	// same cleanup before falling through to the scalar tail
    97  
    98  	// 8 bytes at a time using 64-bit register
    99  bigloop:
   100  	CMPQ	BX, $8
   101  	JBE	leftover	// 8 or fewer bytes left: one final load covers them
   102  	MOVQ	(SI), CX
   103  	MOVQ	(DI), DX
   104  	ADDQ	$8, SI
   105  	ADDQ	$8, DI
   106  	SUBQ	$8, BX
   107  	CMPQ	CX, DX
   108  	JEQ	bigloop
   109  	MOVB	$0, (AX)	// the two 8-byte words differ
   110  	RET
   111  
   112  	// remaining 0-8 bytes: compare the 8 bytes that end at the end of each buffer. Only reached for sizes >= 8, so the load may re-read bytes already found equal but never starts before the buffer.
   113  leftover:
   114  	MOVQ	-8(SI)(BX*1), CX	// 8 bytes of a ending at SI+BX
   115  	MOVQ	-8(DI)(BX*1), DX	// 8 bytes of b ending at DI+BX
   116  	CMPQ	CX, DX
   117  	SETEQ	(AX)	// result = 1 if the final words match, else 0
   118  	RET
   119  
   120  small:
   121  	CMPQ	BX, $0
   122  	JEQ	equal	// count 0: ZF from this compare makes the SETEQ at equal store 1
   123  
   124  	LEAQ	0(BX*8), CX
   125  	NEGQ	CX	// CX = -8*count; 64-bit shifts use only the low 6 bits, so shifting by CX shifts by 64-8*count
   126  
   127  	CMPB	SI, $0xf8
   128  	JA	si_high	// low address byte is 0xf9..0xff: an 8-byte load at SI would run past the 256-byte boundary
   129  
   130  	// load at SI won't cross a page boundary.
   131  	MOVQ	(SI), SI	// wanted bytes are the low count bytes; anything above is discarded by the SHLQ below
   132  	JMP	si_finish
   133  si_high:
   134  	// address ends in 11111xxx. Load up to bytes we want, move to correct position.
   135  	MOVQ	-8(SI)(BX*1), SI	// 8 bytes ending at a+count; wanted bytes are the high count bytes
   136  	SHRQ	CX, SI	// move them down to the low end; the upper bits become zero
   137  si_finish:
   138  
   139  	// same for DI.
   140  	CMPB	DI, $0xf8
   141  	JA	di_high
   142  	MOVQ	(DI), DI
   143  	JMP	di_finish
   144  di_high:
   145  	MOVQ	-8(DI)(BX*1), DI	// 8 bytes ending at b+count
   146  	SHRQ	CX, DI	// move the wanted bytes down to the low end
   147  di_finish:
   148  
   149  	SUBQ	SI, DI	// low 8*count bits of the difference are zero iff the wanted bytes match
   150  	SHLQ	CX, DI	// shift out everything above count bytes; ZF is set iff the first count bytes are equal
   151  equal:
   152  	SETEQ	(AX)	// consumes ZF from SHLQ above, or from CMPQ BX, $0 when count is 0
   153  	RET
   154