github.com/primecitizens/pcz/std@v0.2.1/core/mem/equal_amd64.s (about)

     1  // SPDX-License-Identifier: Apache-2.0
     2  // Copyright 2023 The Prime Citizens
     3  //
     4  // Copyright 2018 The Go Authors. All rights reserved.
     5  // Use of this source code is governed by a BSD-style
     6  // license that can be found in the LICENSE file.
     7  
     8  //go:build pcz && amd64
     9  
    10  #include "textflag.h"
    11  
    12  // Equal(a, b unsafe.Pointer, size uintptr) bool
    13  TEXT ·Equal<ABIInternal>(SB),NOSPLIT,$0-25
    14  	// Register arguments on entry: AX = a, BX = b, CX = size.
    15  	// memeqbody<> expects a in SI, b in DI and the byte count in BX.
    16  	CMPQ AX, BX
    17  	JEQ same
    18  	MOVQ AX, SI
    19  	MOVQ BX, DI
    20  	MOVQ CX, BX // overwrite BX only after b has been copied to DI
    21  	JMP memeqbody<>(SB) // tail jump: memeqbody's RET returns to our caller
    22  same:
    23  	// a and b are the same address, so no bytes need to be compared.
    24  	MOVQ $1, AX // return 1
    25  	RET
    26  
    27  // memequal_varlen(a, b unsafe.Pointer) bool
    28  TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT,$0-17
    29  	// Register arguments on entry: AX = a, BX = b; the size is read from 8(DX).
    30  	// memeqbody<> expects a in SI, b in DI and the byte count in BX.
    31  	CMPQ AX, BX
    32  	JEQ same
    33  	MOVQ AX, SI
    34  	MOVQ BX, DI
    35  	MOVQ 8(DX), BX    // compiler stores size at offset 8 in the closure
    36  	JMP memeqbody<>(SB) // tail jump: memeqbody's RET returns to our caller
    37  same:
    38  	// a and b are the same address, so no bytes need to be compared.
    39  	MOVQ $1, AX // return 1
    40  	RET
    41  
    42  // memeqbody reports whether the count bytes at a and at b are equal. Input:
    43  //   a in SI
    44  //   b in DI
    45  //   count in BX
    46  // Output:
    47  //   result in AX (1 if equal, 0 if not; the SETEQ exits write only the low byte)
    48  TEXT memeqbody<>(SB),NOSPLIT,$0-0
    49  	CMPQ BX, $8
    50  	JB small // fewer than 8 bytes: one 8-byte load per side, extra bytes shifted away
    51  	CMPQ BX, $64
    52  	JB bigloop // 8..63 bytes: skip the 64-byte vector loops
    53  #ifndef hasAVX2
    54  	CMPB ·hasAVX2(SB), $1 // hasAVX2 is not a build-time macro here: test the package-level flag byte
    55  	JE hugeloop_avx2
    56  
    57  	// 64 bytes at a time using xmm registers
    58  hugeloop:
    59  	CMPQ BX, $64
    60  	JB bigloop // under 64 bytes left: finish in the 8-byte loop
    61  	MOVOU (SI), X0 // X0, X2, X4, X6 = four 16-byte chunks of a
    62  	MOVOU (DI), X1 // X1, X3, X5, X7 = the matching chunks of b
    63  	MOVOU 16(SI), X2
    64  	MOVOU 16(DI), X3
    65  	MOVOU 32(SI), X4
    66  	MOVOU 32(DI), X5
    67  	MOVOU 48(SI), X6
    68  	MOVOU 48(DI), X7
    69  	PCMPEQB X1, X0 // per byte: 0xff where a and b match, 0x00 where they differ
    70  	PCMPEQB X3, X2
    71  	PCMPEQB X5, X4
    72  	PCMPEQB X7, X6
    73  	PAND X2, X0 // AND the four results: a byte stays 0xff only if it matched in every chunk
    74  	PAND X6, X4
    75  	PAND X4, X0
    76  	PMOVMSKB X0, DX // DX = top bit of each of the 16 bytes
    77  	ADDQ $64, SI // advance before the CMPL so its flags are still live at the JEQ
    78  	ADDQ $64, DI
    79  	SUBQ $64, BX
    80  	CMPL DX, $0xffff // all 16 bits set: every one of the 64 bytes matched
    81  	JEQ hugeloop
    82  	XORQ AX, AX // return 0
    83  	RET
    84  #endif
    85  
    86  	// 64 bytes at a time using ymm registers
    87  hugeloop_avx2:
    88  	CMPQ BX, $64
    89  	JB bigloop_avx2 // under 64 bytes left: leave the AVX loop via VZEROUPPER
    90  	VMOVDQU (SI), Y0 // Y0, Y2 = two 32-byte chunks of a
    91  	VMOVDQU (DI), Y1 // Y1, Y3 = the matching chunks of b
    92  	VMOVDQU 32(SI), Y2
    93  	VMOVDQU 32(DI), Y3
    94  	VPCMPEQB Y1, Y0, Y4 // per byte: 0xff where a and b match, 0x00 where they differ
    95  	VPCMPEQB Y2, Y3, Y5
    96  	VPAND Y4, Y5, Y6 // a byte stays 0xff only if it matched in both chunks
    97  	VPMOVMSKB Y6, DX // DX = top bit of each of the 32 bytes
    98  	ADDQ $64, SI // advance before the CMPL so its flags are still live at the JEQ
    99  	ADDQ $64, DI
   100  	SUBQ $64, BX
   101  	CMPL DX, $0xffffffff // all 32 bits set: every one of the 64 bytes matched
   102  	JEQ hugeloop_avx2
   103  	VZEROUPPER // clear the upper ymm halves before returning
   104  	XORQ AX, AX // return 0
   105  	RET
   106  
   107  bigloop_avx2:
   108  	VZEROUPPER // clear the upper ymm halves before falling into the scalar loop
   109  
   110  	// 8 bytes at a time using 64-bit register
   111  bigloop:
   112  	CMPQ BX, $8
   113  	JBE leftover // 8 or fewer bytes left: compare the final 8 bytes in one go
   114  	MOVQ (SI), CX
   115  	MOVQ (DI), DX
   116  	ADDQ $8, SI // advance before the CMPQ so its flags are still live at the JEQ
   117  	ADDQ $8, DI
   118  	SUBQ $8, BX
   119  	CMPQ CX, DX
   120  	JEQ bigloop
   121  	XORQ AX, AX // return 0
   122  	RET
   123  
   124  	// remaining 0-8 bytes
   125  leftover:
   126  	MOVQ -8(SI)(BX*1), CX // the 8 bytes ending at a's end; may re-read bytes already compared
   127  	MOVQ -8(DI)(BX*1), DX // the 8 bytes ending at b's end (count was >= 8 on entry to this path)
   128  	CMPQ CX, DX
   129  	SETEQ AX
   130  	RET
   131  
   132  small:
   133  	CMPQ BX, $0
   134  	JEQ equal // zero length: ZF is set here, so the SETEQ at equal yields 1
   135  
   136  	LEAQ 0(BX*8), CX
   137  	NEGQ CX // CX = -8*count; 64-bit shifts use the count mod 64, i.e. 64-8*count
   138  
   139  	CMPB SI, $0xf8 // looks only at the low byte of a's address
   140  	JA si_high
   141  
   142  	// load at SI won't cross a page boundary.
   143  	MOVQ (SI), SI // wanted bytes are the low count bytes; the higher bytes are shifted out below
   144  	JMP si_finish
   145  si_high:
   146  	// address ends in 11111xxx. Load up to bytes we want, move to correct position.
   147  	MOVQ -8(SI)(BX*1), SI // the 8 bytes ending at a+count
   148  	SHRQ CX, SI // shift the wanted count bytes down into the low bytes
   149  si_finish:
   150  
   151  	// same for DI.
   152  	CMPB DI, $0xf8
   153  	JA di_high
   154  	MOVQ (DI), DI
   155  	JMP di_finish
   156  di_high:
   157  	MOVQ -8(DI)(BX*1), DI
   158  	SHRQ CX, DI
   159  di_finish:
   160  
   161  	SUBQ SI, DI // low count bytes of the difference are zero iff the wanted bytes match
   162  	SHLQ CX, DI // shift out the unwanted high bytes; ZF is set iff the wanted bytes were equal
   163  equal:
   164  	SETEQ AX
   165  	RET