github.com/primecitizens/pcz/std@v0.2.1/core/mem/equal_arm64.s (about)

     1  // SPDX-License-Identifier: Apache-2.0
     2  // Copyright 2023 The Prime Citizens
     3  //
     4  // Copyright 2018 The Go Authors. All rights reserved.
     5  // Use of this source code is governed by a BSD-style
     6  // license that can be found in the LICENSE file.
     7  
     8  //go:build pcz && arm64
     9  
    10  #include "textflag.h"
    11  
    12  // Equal(a, b unsafe.Pointer, size uintptr) bool
    13  // in:  R0 = a, R1 = b, R2 = size in bytes
    14  // out: R0 = 1 when the two size-byte regions hold identical bytes, else 0
    15  TEXT ·Equal<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-25
    16  	CBZ R2, equal // short path: zero bytes always compare equal
    17  	CMP R0, R1 // short path: both pointers name the same memory,
    18  	BEQ equal // so the contents match without reading them
    19  	B memeqbody<>(SB) // plain branch (no link): memeqbody sets R0 and returns to our caller
    20  equal:
    21  	MOVD $1, R0
    22  	RET
    20  
    21  // memequal_varlen(a, b unsafe.Pointer) bool
    22  // R0 = a, R1 = b; the byte count is not an argument, it is read from the closure.
    23  TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT,$0-17
    24  	CMP R0, R1
    25  	BEQ same // identical pointers: nothing to compare
    26  	MOVD 8(R26), R2 // size lives at offset 8 of the closure (stored there by the compiler)
    27  	CBNZ R2, compare // non-empty region: do the real comparison
    28  same:
    29  	// reached for identical pointers or a zero size
    30  	MOVD $1, R0
    31  	RET
    32  compare:
    33  	B memeqbody<>(SB) // R0, R1, R2 already hold a, b, len as memeqbody expects
    31  
    32  // memeqbody compares the R2 bytes starting at R0 with the R2 bytes starting at R1.
    33  // input: R0 = pointer a, R1 = pointer b, R2 = data len in bytes
    34  // (both callers in this file branch here only when R2 is non-zero)
    35  // at return: result in R0 (1 if every byte matched, 0 otherwise)
    36  // overwritten along the way: R0-R7, R9 and V0-V11
    37  TEXT memeqbody<>(SB),NOSPLIT,$0
    38  	CMP $1, R2
    39  	// handle 1-byte special case for better performance
    40  	BEQ one
    41  	CMP $16, R2
    42  	// handle specially if length < 16
    43  	BLO tail
    44  	BIC $0x3f, R2, R3 // R3 = len with the low 6 bits cleared (whole 64-byte chunks)
    45  	CBZ R3, chunk16 // fewer than 64 bytes: skip the vector loop
    46  	// work with 64-byte chunks
    47  	ADD R3, R0, R6 // R6 = value R0 will have at the end of the 64-byte chunks
    48  chunk64_loop:
    49  	VLD1.P (R0), [V0.D2, V1.D2, V2.D2, V3.D2] // next 64 bytes of a; R0 is post-incremented
    50  	VLD1.P (R1), [V4.D2, V5.D2, V6.D2, V7.D2] // next 64 bytes of b; R1 is post-incremented
    51  	VCMEQ V0.D2, V4.D2, V8.D2 // per 64-bit lane: all ones if equal, zero if not
    52  	VCMEQ V1.D2, V5.D2, V9.D2
    53  	VCMEQ V2.D2, V6.D2, V10.D2
    54  	VCMEQ V3.D2, V7.D2, V11.D2
    55  	VAND V8.B16, V9.B16, V8.B16 // fold the four results together into V8
    56  	VAND V8.B16, V10.B16, V8.B16
    57  	VAND V8.B16, V11.B16, V8.B16
    58  	CMP R0, R6 // end-of-chunks test; flags are consumed by the BNE below (VMOV/CBZ leave them alone)
    59  	VMOV V8.D[0], R4 // low lane of the folded result
    60  	VMOV V8.D[1], R5 // high lane of the folded result
    61  	CBZ R4, not_equal // a zero lane means some 8-byte group differed
    62  	CBZ R5, not_equal
    63  	BNE chunk64_loop // R0 has not reached R6 yet: more 64-byte chunks
    64  	AND $0x3f, R2, R2 // R2 = bytes left after the 64-byte chunks
    65  	CBZ R2, equal
    66  chunk16:
    67  	// work with 16-byte chunks
    68  	BIC $0xf, R2, R3 // R3 = remaining len with the low 4 bits cleared (whole 16-byte chunks)
    69  	CBZ R3, tail // fewer than 16 bytes left
    70  	ADD R3, R0, R6 // R6 = value R0 will have at the end of the 16-byte chunks
    71  chunk16_loop:
    72  	LDP.P 16(R0), (R4, R5) // next 16 bytes of a; R0 is post-incremented by 16
    73  	LDP.P 16(R1), (R7, R9) // next 16 bytes of b; R1 is post-incremented by 16
    74  	EOR R4, R7 // R7 ^= R4: non-zero iff the first 8 bytes differ
    75  	CBNZ R7, not_equal
    76  	EOR R5, R9 // R9 ^= R5: non-zero iff the second 8 bytes differ
    77  	CBNZ R9, not_equal
    78  	CMP R0, R6
    79  	BNE chunk16_loop
    80  	AND $0xf, R2, R2 // R2 = bytes left after the 16-byte chunks
    81  	CBZ R2, equal
    82  tail:
    83  	// special compare of tail with length < 16
    84  	TBZ $3, R2, lt_8 // bit 3 clear: fewer than 8 bytes remain
    85  	MOVD (R0), R4 // 8..15 bytes remain: compare the first 8 ...
    86  	MOVD (R1), R5
    87  	EOR R4, R5
    88  	CBNZ R5, not_equal
    89  	SUB $8, R2, R6 // offset of the last 8 bytes (this load may overlap the first 8)
    90  	MOVD (R0)(R6), R4 // ... then the last 8
    91  	MOVD (R1)(R6), R5
    92  	EOR R4, R5
    93  	CBNZ R5, not_equal
    94  	B equal
    95  lt_8:
    96  	TBZ $2, R2, lt_4 // bit 2 clear: fewer than 4 bytes remain
    97  	MOVWU (R0), R4 // 4..7 bytes remain: compare the first 4 ...
    98  	MOVWU (R1), R5
    99  	EOR R4, R5
   100  	CBNZ R5, not_equal
   101  	SUB $4, R2, R6 // offset of the last 4 bytes (this load may overlap the first 4)
   102  	MOVWU (R0)(R6), R4 // ... then the last 4
   103  	MOVWU (R1)(R6), R5
   104  	EOR R4, R5
   105  	CBNZ R5, not_equal
   106  	B equal
   107  lt_4:
   108  	TBZ $1, R2, lt_2 // bit 1 clear: at most 1 byte remains
   109  	MOVHU.P 2(R0), R4 // 2 or 3 bytes remain: compare 2 and step both pointers past them
   110  	MOVHU.P 2(R1), R5
   111  	CMP R4, R5
   112  	BNE not_equal
   113  lt_2:
   114  	TBZ $0, R2, equal // bit 0 clear: no odd byte left, everything matched
   115  one:
   116  	MOVBU (R0), R4 // compare the single byte at the current pointers
   117  	MOVBU (R1), R5
   118  	CMP R4, R5
   119  	BNE not_equal
   120  equal:
   121  	MOVD $1, R0 // result: true
   122  	RET
   123  not_equal:
   124  	MOVB ZR, R0 // result: false (R0 = 0)
   125  	RET