github.com/JimmyHuang454/JLS-go@v0.0.0-20230831150107-90d536585ba0/internal/bytealg/equal_amd64.s (about)

     1  // Copyright 2018 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "asm_amd64.h"
     7  #include "textflag.h"
     8  
     9  // memequal(a, b unsafe.Pointer, size uintptr) bool
    10  // Returns 1 right away when a == b, otherwise tail-jumps to memeqbody.
    11  TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT,$0-25
    12  	// Arguments arrive as AX = a, BX = b, CX = size.
    13  	// memeqbody wants them as SI = a, DI = b, BX = size.
    14  	CMPQ	AX, BX
    15  	JEQ	same	// identical pointers: equal without reading memory
    16  	MOVQ	BX, DI	// copy b out of BX before BX is reused for size
    17  	MOVQ	CX, BX
    18  	MOVQ	AX, SI
    19  	JMP	memeqbody<>(SB)	// memeqbody's RET returns to our caller
    20  same:
    21  	MOVQ	$1, AX	// return 1
    22  	RET
    23  
    24  // memequal_varlen(a, b unsafe.Pointer) bool
    25  // Returns 1 right away when a == b, otherwise tail-jumps to memeqbody.
    26  TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT,$0-17
    27  	// Arguments arrive as AX = a, BX = b; size is read from 8(DX).
    28  	// memeqbody wants them as SI = a, DI = b, BX = size.
    29  	CMPQ	AX, BX
    30  	JEQ	same	// identical pointers: equal without reading memory
    31  	MOVQ	BX, DI	// copy b out of BX before BX is reused for size
    32  	MOVQ	8(DX), BX    // compiler stores size at offset 8 in the closure
    33  	MOVQ	AX, SI
    34  	JMP	memeqbody<>(SB)	// memeqbody's RET returns to our caller
    35  same:
    36  	MOVQ	$1, AX	// return 1
    37  	RET
    38  
    39  // memeqbody compares count bytes at a with count bytes at b.
    40  // Input:
    41  //   a in SI, b in DI, count in BX
    42  // Output:
    43  //   result in AX: low byte is 1 if all bytes match, 0 otherwise
    44  // Also overwrites SI, DI, BX, CX, DX, flags, and X0-X7 or Y0-Y6 on the vector paths.
    45  TEXT memeqbody<>(SB),NOSPLIT,$0-0
    46  	CMPQ	BX, $8
    47  	JB	small	// counts 0-7 use the single masked load path
    48  	CMPQ	BX, $64
    49  	JB	bigloop	// 8-63 bytes: skip the vector loops
    50  #ifndef hasAVX2
    51  	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1	// run-time HasAVX2 flag
    52  	JE	hugeloop_avx2
    53  	// NOTE(review): hasAVX2 is presumably defined by asm_amd64.h when AVX2 is guaranteed - confirm.
    54  	// 64 bytes at a time using xmm registers
    55  hugeloop:
    56  	CMPQ	BX, $64
    57  	JB	bigloop	// fewer than 64 left: continue 8 bytes at a time
    58  	MOVOU	(SI), X0
    59  	MOVOU	(DI), X1
    60  	MOVOU	16(SI), X2
    61  	MOVOU	16(DI), X3
    62  	MOVOU	32(SI), X4
    63  	MOVOU	32(DI), X5
    64  	MOVOU	48(SI), X6
    65  	MOVOU	48(DI), X7
    66  	PCMPEQB	X1, X0	// each result byte is 0xff where the inputs match
    67  	PCMPEQB	X3, X2
    68  	PCMPEQB	X5, X4
    69  	PCMPEQB	X7, X6
    70  	PAND	X2, X0	// fold the four compare masks into X0
    71  	PAND	X6, X4
    72  	PAND	X4, X0
    73  	PMOVMSKB X0, DX	// DX = one bit per byte of the folded mask
    74  	ADDQ	$64, SI
    75  	ADDQ	$64, DI
    76  	SUBQ	$64, BX
    77  	CMPL	DX, $0xffff	// all 16 bits set: all 64 bytes matched
    78  	JEQ	hugeloop
    79  	XORQ	AX, AX	// return 0
    80  	RET
    81  #endif
    82  
    83  	// 64 bytes at a time using ymm registers
    84  hugeloop_avx2:
    85  	CMPQ	BX, $64
    86  	JB	bigloop_avx2	// fewer than 64 left: VZEROUPPER, then the 8-byte loop
    87  	VMOVDQU	(SI), Y0
    88  	VMOVDQU	(DI), Y1
    89  	VMOVDQU	32(SI), Y2
    90  	VMOVDQU	32(DI), Y3
    91  	VPCMPEQB	Y1, Y0, Y4	// each result byte is 0xff where the inputs match
    92  	VPCMPEQB	Y2, Y3, Y5
    93  	VPAND	Y4, Y5, Y6	// fold both compare masks into Y6
    94  	VPMOVMSKB Y6, DX	// DX = one bit per byte of the folded mask
    95  	ADDQ	$64, SI
    96  	ADDQ	$64, DI
    97  	SUBQ	$64, BX
    98  	CMPL	DX, $0xffffffff	// all 32 bits set: all 64 bytes matched
    99  	JEQ	hugeloop_avx2
   100  	VZEROUPPER	// clear upper YMM halves before leaving the AVX path
   101  	XORQ	AX, AX	// return 0
   102  	RET
   103  
   104  bigloop_avx2:
   105  	VZEROUPPER	// clear upper YMM halves before the scalar code below
   106  
   107  	// 8 bytes at a time using 64-bit register
   108  bigloop:
   109  	CMPQ	BX, $8
   110  	JBE	leftover	// 8 or fewer left: finish with one overlapping load
   111  	MOVQ	(SI), CX
   112  	MOVQ	(DI), DX
   113  	ADDQ	$8, SI
   114  	ADDQ	$8, DI
   115  	SUBQ	$8, BX
   116  	CMPQ	CX, DX
   117  	JEQ	bigloop
   118  	XORQ	AX, AX	// return 0
   119  	RET
   120  	// Reached only when the original count was >= 8 (JB small at entry), so the
   121  	// remaining 0-8 bytes are covered by re-reading the 8 bytes that end at SI+BX.
   122  leftover:
   123  	MOVQ	-8(SI)(BX*1), CX
   124  	MOVQ	-8(DI)(BX*1), DX
   125  	CMPQ	CX, DX
   126  	SETEQ	AX	// result byte = (CX == DX)
   127  	RET
   128  
   129  small:
   130  	CMPQ	BX, $0
   131  	JEQ	equal	// ZF is set here, so SETEQ at equal yields 1
   132  	// BX is 1-7 from here on. CX = -(8*BX); the shifts below use its low 6 bits, i.e. 64 - 8*BX.
   133  	LEAQ	0(BX*8), CX
   134  	NEGQ	CX
   135  
   136  	CMPB	SI, $0xf8	// test the low byte of the address
   137  	JA	si_high
   138  
   139  	// load at SI won't cross a page boundary.
   140  	MOVQ	(SI), SI	// wanted bytes are the low BX bytes; the rest is shifted out later
   141  	JMP	si_finish
   142  si_high:
   143  	// address ends in 11111xxx. Load up to bytes we want, move to correct position.
   144  	MOVQ	-8(SI)(BX*1), SI
   145  	SHRQ	CX, SI	// drop the bytes that precede a; wanted bytes land in the low positions
   146  si_finish:
   147  
   148  	// same for DI.
   149  	CMPB	DI, $0xf8
   150  	JA	di_high
   151  	MOVQ	(DI), DI
   152  	JMP	di_finish
   153  di_high:
   154  	MOVQ	-8(DI)(BX*1), DI
   155  	SHRQ	CX, DI
   156  di_finish:
   157  
   158  	SUBQ	SI, DI	// low 8*BX bits of the difference are zero iff the wanted bytes match
   159  	SHLQ	CX, DI	// shift out the unwanted high bytes; ZF = wanted bytes equal
   160  equal:
   161  	SETEQ	AX	// result byte = ZF, from CMPQ BX, $0 or from SHLQ above
   162  	RET