// SPDX-License-Identifier: Apache-2.0
// Copyright 2023 The Prime Citizens
//
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build pcz && amd64

#include "textflag.h"

// Equal(a, b unsafe.Pointer, size uintptr) bool
//
// Reports whether the size bytes at a and the size bytes at b are identical.
// If a and b are the same pointer the answer is 1 without reading memory.
// Otherwise the arguments are shuffled into the registers memeqbody expects
// and control is transferred with JMP (not CALL), so memeqbody's RET returns
// straight to Equal's caller with the result in AX.
TEXT ·Equal<ABIInternal>(SB),NOSPLIT,$0-25
	// AX = a    (want in SI)
	// BX = b    (want in DI)
	// CX = size (want in BX)
	CMPQ	AX, BX
	JNE	neq
	MOVQ	$1, AX	// return 1
	RET
neq:
	// Order matters: BX is read (into DI) before it is overwritten with size.
	MOVQ	AX, SI
	MOVQ	BX, DI
	MOVQ	CX, BX
	JMP	memeqbody<>(SB)

// memequal_varlen(a, b unsafe.Pointer) bool
//
// Same contract as Equal, except the byte count is not an argument: it is
// loaded from offset 8 of the block addressed by DX.
TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT,$0-17
	// AX = a       (want in SI)
	// BX = b       (want in DI)
	// 8(DX) = size (want in BX)
	CMPQ	AX, BX
	JNE	neq
	MOVQ	$1, AX	// return 1
	RET
neq:
	MOVQ	AX, SI
	MOVQ	BX, DI
	MOVQ	8(DX), BX	// compiler stores size at offset 8 in the closure
	JMP	memeqbody<>(SB)

// memeqbody is the shared comparison routine. It is entered by JMP from the
// wrappers above and returns to their caller.
//
// Input:
//   a in SI
//   b in DI
//   count in BX
// Output:
//   result in AX (1 if equal, 0 otherwise; written via SETEQ on the
//   tail paths, XORQ/AX=0 on the early-mismatch paths)
//
// Strategy by count:
//   count < 8   -> small:    one (possibly shifted) 8-byte load per side
//   8..63       -> bigloop:  8 bytes per iteration, then an overlapping tail
//   count >= 64 -> hugeloop / hugeloop_avx2: 64 bytes per iteration, then
//                  fall back to bigloop for the remainder
//
// SI, DI, BX, CX, DX and the vector registers used below are clobbered.
TEXT memeqbody<>(SB),NOSPLIT,$0-0
	CMPQ	BX, $8
	JB	small
	CMPQ	BX, $64
	JB	bigloop
#ifndef hasAVX2
	// The hasAVX2 macro is not defined at assembly time, so consult the
	// ·hasAVX2 byte at run time and keep the SSE loop as the fallback.
	// When the macro is defined this whole section is omitted and execution
	// falls directly into hugeloop_avx2.
	CMPB	·hasAVX2(SB), $1
	JE	hugeloop_avx2

	// 64 bytes at a time using xmm registers
hugeloop:
	CMPQ	BX, $64
	JB	bigloop
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	MOVOU	16(SI), X2
	MOVOU	16(DI), X3
	MOVOU	32(SI), X4
	MOVOU	32(DI), X5
	MOVOU	48(SI), X6
	MOVOU	48(DI), X7
	// Byte-wise compare: each result byte is 0xff where the inputs match.
	PCMPEQB	X1, X0
	PCMPEQB	X3, X2
	PCMPEQB	X5, X4
	PCMPEQB	X7, X6
	// AND the four 16-byte results together; any mismatching byte leaves a
	// zero byte in X0.
	PAND	X2, X0
	PAND	X6, X4
	PAND	X4, X0
	// Collapse to a 16-bit mask (one bit per byte) in DX.
	PMOVMSKB X0, DX
	// Advance before testing the mask; on mismatch we return immediately, so
	// the updated pointers/count are simply unused.
	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, BX
	CMPL	DX, $0xffff	// all 16 mask bits set <=> all 64 bytes equal
	JEQ	hugeloop
	XORQ	AX, AX	// return 0
	RET
#endif

	// 64 bytes at a time using ymm registers
hugeloop_avx2:
	CMPQ	BX, $64
	JB	bigloop_avx2
	VMOVDQU	(SI), Y0
	VMOVDQU	(DI), Y1
	VMOVDQU	32(SI), Y2
	VMOVDQU	32(DI), Y3
	// Two 32-byte byte-wise compares, combined with AND into Y6.
	VPCMPEQB	Y1, Y0, Y4
	VPCMPEQB	Y2, Y3, Y5
	VPAND	Y4, Y5, Y6
	// Collapse to a 32-bit mask (one bit per byte) in DX.
	VPMOVMSKB Y6, DX
	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, BX
	CMPL	DX, $0xffffffff	// all 32 mask bits set <=> all 64 bytes equal
	JEQ	hugeloop_avx2
	// Clear the upper halves of the ymm registers before leaving AVX code.
	VZEROUPPER
	XORQ	AX, AX	// return 0
	RET

bigloop_avx2:
	// Leaving the AVX2 loop with fewer than 64 bytes left: clear the upper
	// ymm halves, then fall through into the scalar loop.
	VZEROUPPER

	// 8 bytes at a time using 64-bit register
bigloop:
	// JBE (not JB): exactly 8 remaining bytes are handled by leftover too.
	CMPQ	BX, $8
	JBE	leftover
	MOVQ	(SI), CX
	MOVQ	(DI), DX
	ADDQ	$8, SI
	ADDQ	$8, DI
	SUBQ	$8, BX
	CMPQ	CX, DX
	JEQ	bigloop
	XORQ	AX, AX	// return 0
	RET

	// remaining 0-8 bytes
leftover:
	// Compare the 8 bytes that END at SI+BX / DI+BX. For BX < 8 this window
	// reaches back over bytes already compared equal; that is safe because
	// this label is only reachable when the original count was at least 8.
	MOVQ	-8(SI)(BX*1), CX
	MOVQ	-8(DI)(BX*1), DX
	CMPQ	CX, DX
	SETEQ	AX
	RET

small:
	// count is 0..7 here. For count == 0 the CMPQ below leaves ZF set, so
	// the SETEQ at equal: yields 1.
	CMPQ	BX, $0
	JEQ	equal

	// CX = -(8*count). The 64-bit shifts below use only the low 6 bits of
	// CL, so the effective shift amount is 64 - 8*count (8..56).
	LEAQ	0(BX*8), CX
	NEGQ	CX

	// Compare only the low byte of the address against 0xf8.
	CMPB	SI, $0xf8
	JA	si_high

	// load at SI won't cross a page boundary.
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	// address ends in 11111xxx. Load up to bytes we want, move to correct position.
	// (Loads the 8 bytes ending at SI+count, then shifts right so the count
	// wanted bytes occupy the low end of the register.)
	MOVQ	-8(SI)(BX*1), SI
	SHRQ	CX, SI
si_finish:

	// same for DI.
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(BX*1), DI
	SHRQ	CX, DI
di_finish:

	// The low 8*count bits of the difference are zero iff the wanted bytes
	// match; shifting left by 64 - 8*count discards everything above them,
	// so ZF after SHLQ reflects equality of exactly count bytes.
	SUBQ	SI, DI
	SHLQ	CX, DI
equal:
	// ZF comes from SHLQ above, or from CMPQ BX, $0 when count == 0.
	SETEQ	AX
	RET