github.com/icodeface/tls@v0.0.0-20230910023335-34df9250cd12/internal/bytealg/equal_amd64.s (about)

     1  // Copyright 2018 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "textflag.h"
     7  
     8  TEXT ·Equal(SB),NOSPLIT,$0-49	// frame layout is that of func Equal(a, b []byte) bool
     9  	MOVQ	b_len+32(FP), CX	// CX = len(b)
    10  	MOVQ	a_len+8(FP), BX	// BX = len(a); doubles as the byte count handed to memeqbody
    11  	CMPQ	CX, BX
    12  	JNE	length_mismatch	// slices of different length are never equal
    13  	MOVQ	b_base+24(FP), DI	// DI = &b[0]
    14  	MOVQ	a_base+0(FP), SI	// SI = &a[0]
    15  	CMPQ	DI, SI
    16  	JEQ	same_base	// same start and same length: equal without reading memory
    17  	LEAQ	ret+48(FP), AX	// AX = address of the result byte
    18  	JMP	memeqbody<>(SB)	// tail call: memeqbody stores the result and returns to our caller
    19  same_base:
    20  	MOVB	$1, ret+48(FP)
    21  	RET
    22  length_mismatch:
    23  	MOVB	$0, ret+48(FP)
    24  	RET
    25  
    26  // memequal(a, b unsafe.Pointer, size uintptr) bool reports whether the size bytes at a and at b are identical.
    27  TEXT runtime·memequal(SB),NOSPLIT,$0-25
    28  	MOVQ	b+8(FP), DI	// DI = b
    29  	MOVQ	a+0(FP), SI	// SI = a
    30  	CMPQ	DI, SI
    31  	JEQ	same_ptr	// identical addresses: equal without reading memory
    32  	LEAQ	ret+24(FP), AX	// AX = address of the result byte
    33  	MOVQ	size+16(FP), BX	// BX = number of bytes to compare
    34  	JMP	memeqbody<>(SB)	// tail call: memeqbody stores the result and returns to our caller
    35  same_ptr:
    36  	MOVB	$1, ret+24(FP)
    37  	RET
    38  
    39  // memequal_varlen(a, b unsafe.Pointer) bool is memequal with the byte count read from the closure that DX points to.
    40  TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17
    41  	MOVQ	b+8(FP), DI	// DI = b
    42  	MOVQ	a+0(FP), SI	// SI = a
    43  	CMPQ	DI, SI
    44  	JEQ	same_ptr	// identical addresses: equal without reading memory
    45  	LEAQ	ret+16(FP), AX	// AX = address of the result byte
    46  	MOVQ	8(DX), BX	// BX = size, which the compiler stores at offset 8 in the closure
    47  	JMP	memeqbody<>(SB)	// tail call: memeqbody stores the result and returns to our caller
    48  same_ptr:
    49  	MOVB	$1, ret+16(FP)
    50  	RET
    51  
    52  // memeqbody compares BX bytes at SI with BX bytes at DI and stores 1 (equal) or 0 (different) in the byte at (AX).
    53  // In: a in SI, b in DI, count in BX, address of result byte in AX.
    54  // It is entered by JMP from the wrappers in this file, so its RET returns to the wrapper's caller.
    55  // Clobbers SI, DI, BX, CX, DX, flags, and X0-X7 (xmm loop) or Y0-Y6 (ymm loop).
    56  TEXT memeqbody<>(SB),NOSPLIT,$0-0
    57  	CMPQ	BX, $8
    58  	JB	small	// fewer than 8 bytes: a single shifted 8-byte compare
    59  	CMPQ	BX, $64
    60  	JB	bigloop	// 8..63 bytes: 8 bytes per iteration
    61  	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
    62  	JE	hugeloop_avx2	// flag byte is 1: take the ymm loop instead of the xmm loop
    63  
    64  	// 64 bytes at a time using xmm registers
    65  hugeloop:
    66  	CMPQ	BX, $64
    67  	JB	bigloop	// under 64 bytes left: finish with the 8-byte loop
    68  	MOVOU	(SI), X0
    69  	MOVOU	(DI), X1
    70  	MOVOU	16(SI), X2
    71  	MOVOU	16(DI), X3
    72  	MOVOU	32(SI), X4
    73  	MOVOU	32(DI), X5
    74  	MOVOU	48(SI), X6
    75  	MOVOU	48(DI), X7
    76  	PCMPEQB	X1, X0	// each result byte is 0xff where the inputs match, 0x00 where they differ
    77  	PCMPEQB	X3, X2
    78  	PCMPEQB	X5, X4
    79  	PCMPEQB	X7, X6
    80  	PAND	X2, X0	// AND the four results: a byte stays 0xff only if it matched in all four 16-byte chunks
    81  	PAND	X6, X4
    82  	PAND	X4, X0
    83  	PMOVMSKB X0, DX	// DX = top bit of each of the 16 bytes of X0
    84  	ADDQ	$64, SI	// advance before testing; the CMPL below sets the flags the branch uses
    85  	ADDQ	$64, DI
    86  	SUBQ	$64, BX
    87  	CMPL	DX, $0xffff	// all 16 mask bits set <=> all 64 bytes matched
    88  	JEQ	hugeloop
    89  	MOVB	$0, (AX)	// mismatch found
    90  	RET
    91  
    92  	// 64 bytes at a time using ymm registers
    93  hugeloop_avx2:
    94  	CMPQ	BX, $64
    95  	JB	bigloop_avx2	// under 64 bytes left: leave the ymm loop
    96  	VMOVDQU	(SI), Y0
    97  	VMOVDQU	(DI), Y1
    98  	VMOVDQU	32(SI), Y2
    99  	VMOVDQU	32(DI), Y3
   100  	VPCMPEQB	Y1, Y0, Y4	// Y4 = byte-wise equality mask for the first 32 bytes
   101  	VPCMPEQB	Y2, Y3, Y5	// Y5 = byte-wise equality mask for the second 32 bytes
   102  	VPAND	Y4, Y5, Y6
   103  	VPMOVMSKB Y6, DX	// DX = top bit of each of the 32 bytes of Y6
   104  	ADDQ	$64, SI
   105  	ADDQ	$64, DI
   106  	SUBQ	$64, BX
   107  	CMPL	DX, $0xffffffff	// all 32 mask bits set <=> all 64 bytes matched
   108  	JEQ	hugeloop_avx2
   109  	VZEROUPPER	// zero the upper ymm halves before returning to code that may use legacy SSE
   110  	MOVB	$0, (AX)	// mismatch found
   111  	RET
   112  
   113  bigloop_avx2:
   114  	VZEROUPPER	// same cleanup as above, then fall through into the 8-byte loop
   115  
   116  	// 8 bytes at a time using 64-bit register
   117  bigloop:
   118  	CMPQ	BX, $8
   119  	JBE	leftover	// 8 or fewer bytes left: handle them with one final load
   120  	MOVQ	(SI), CX
   121  	MOVQ	(DI), DX
   122  	ADDQ	$8, SI
   123  	ADDQ	$8, DI
   124  	SUBQ	$8, BX
   125  	CMPQ	CX, DX
   126  	JEQ	bigloop
   127  	MOVB	$0, (AX)	// mismatch found
   128  	RET
   129  
   130  	// remaining 0-8 bytes; this path is only reached when the count was at least 8 on entry
   131  leftover:
   132  	MOVQ	-8(SI)(BX*1), CX	// the 8 bytes that end at the end of a; may re-read bytes already compared
   133  	MOVQ	-8(DI)(BX*1), DX	// the 8 bytes that end at the end of b
   134  	CMPQ	CX, DX
   135  	SETEQ	(AX)	// store 1 if they match, else 0
   136  	RET
   137  
   138  small:
   139  	CMPQ	BX, $0
   140  	JEQ	equal	// zero length: ZF from this compare makes the SETEQ at equal store 1
   141  
   142  	LEAQ	0(BX*8), CX
   143  	NEGQ	CX	// CX = -8*BX; 64-bit shifts use only the low 6 bits of the count, i.e. 64-8*BX
   144  
   145  	CMPB	SI, $0xf8
   146  	JA	si_high	// low address byte is 0xf9..0xff: an 8-byte load at SI would cross a 256-byte boundary
   147  
   148  	// load at SI won't cross a page boundary. Bytes beyond the first BX are discarded by the SHLQ below.
   149  	MOVQ	(SI), SI
   150  	JMP	si_finish
   151  si_high:
   152  	// low address byte is 0xf9..0xff. Load the 8 bytes ending at the last byte we want, then shift the wanted bytes down to the low end.
   153  	MOVQ	-8(SI)(BX*1), SI
   154  	SHRQ	CX, SI
   155  si_finish:
   156  
   157  	// same for DI.
   158  	CMPB	DI, $0xf8
   159  	JA	di_high
   160  	MOVQ	(DI), DI
   161  	JMP	di_finish
   162  di_high:
   163  	MOVQ	-8(DI)(BX*1), DI
   164  	SHRQ	CX, DI
   165  di_finish:
   166  
   167  	SUBQ	SI, DI	// DI = (b bytes) - (a bytes); its low 8*BX bits are zero iff the first BX bytes agree
   168  	SHLQ	CX, DI	// shift out the high 64-8*BX bits; ZF is set iff the low BX bytes are equal
   169  equal:
   170  	SETEQ	(AX)	// store 1 if ZF is set, else 0
   171  	RET
   172