// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "textflag.h"

// Equal compares two byte slices and stores a one-byte boolean result.
//
// Argument frame, as addressed below:
//	a_base+0(FP), a_len+8(FP)	first slice (pointer, length)
//	b_base+24(FP), b_len+32(FP)	second slice (pointer, length)
//	ret+48(FP)			result byte: 1 if equal, 0 otherwise
//
// Different lengths give 0 without touching the data. Identical base
// pointers (with equal lengths) give 1 without touching the data.
// Everything else is decided by memeqbody, which is entered with a
// tail jump and writes the result itself.
TEXT ·Equal(SB),NOSPLIT,$0-49
	MOVQ	a_len+8(FP), BX
	MOVQ	b_len+32(FP), CX
	CMPQ	BX, CX
	JNE	neq
	MOVQ	a_base+0(FP), SI
	MOVQ	b_base+24(FP), DI
	CMPQ	SI, DI
	JEQ	eq
	LEAQ	ret+48(FP), AX		// memeqbody stores the result through AX
	JMP	memeqbody<>(SB)
neq:
	MOVB	$0, ret+48(FP)
	RET
eq:
	MOVB	$1, ret+48(FP)
	RET

// memequal(a, b unsafe.Pointer, size uintptr) bool
//
// Stores 1 at ret+24(FP) immediately when a and b are the same pointer;
// otherwise hands a, b and size to memeqbody.
TEXT runtime·memequal(SB),NOSPLIT,$0-25
	MOVQ	a+0(FP), SI
	MOVQ	b+8(FP), DI
	CMPQ	SI, DI
	JEQ	eq
	MOVQ	size+16(FP), BX
	LEAQ	ret+24(FP), AX		// memeqbody stores the result through AX
	JMP	memeqbody<>(SB)
eq:
	MOVB	$1, ret+24(FP)
	RET

// memequal_varlen(a, b unsafe.Pointer) bool
//
// Same as memequal, except that the byte count is not an argument:
// it is read from offset 8 of the closure that DX points to.
TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17
	MOVQ	a+0(FP), SI
	MOVQ	b+8(FP), DI
	CMPQ	SI, DI
	JEQ	eq
	MOVQ	8(DX), BX		// compiler stores size at offset 8 in the closure
	LEAQ	ret+16(FP), AX		// memeqbody stores the result through AX
	JMP	memeqbody<>(SB)
eq:
	MOVB	$1, ret+16(FP)
	RET

// memeqbody is the shared comparison kernel. It is reached by JMP, not
// CALL, so its RET returns directly to the caller of the entry point.
//
// Inputs:
//	SI	a
//	DI	b
//	BX	count
//	AX	address of result byte
// Output:
//	(AX)	1 if the two BX-byte regions are equal, 0 otherwise
// Overwrites SI, DI, BX, CX, DX, flags, X0-X7 (SSE path) or Y0-Y6 (AVX2 path).
//
// Strategy by size:
//	count < 8	"small": one masked 8-byte load per operand
//	8 <= count < 64	"bigloop": 8 bytes per step, then an overlapping tail
//	count >= 64	64 bytes per step with SSE or, if available, AVX2,
//			then bigloop for what is left
TEXT memeqbody<>(SB),NOSPLIT,$0-0
	CMPQ	BX, $8
	JB	small
	CMPQ	BX, $64
	JB	bigloop
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JE	hugeloop_avx2

	// 64 bytes at a time using xmm registers
hugeloop:
	CMPQ	BX, $64
	JB	bigloop
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	MOVOU	16(SI), X2
	MOVOU	16(DI), X3
	MOVOU	32(SI), X4
	MOVOU	32(DI), X5
	MOVOU	48(SI), X6
	MOVOU	48(DI), X7
	// Each PCMPEQB leaves 0xff in every byte lane that matched.
	PCMPEQB	X1, X0
	PCMPEQB	X3, X2
	PCMPEQB	X5, X4
	PCMPEQB	X7, X6
	// AND the four results together: a lane stays 0xff only if it
	// matched in all four 16-byte pairs.
	PAND	X2, X0
	PAND	X6, X4
	PAND	X4, X0
	PMOVMSKB X0, DX			// one mask bit per byte lane
	// Advance before testing the mask; the CMPL must stay last because
	// ADDQ/SUBQ overwrite the flags.
	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, BX
	CMPL	DX, $0xffff		// all 16 mask bits set = all 64 bytes equal
	JEQ	hugeloop
	MOVB	$0, (AX)
	RET

	// 64 bytes at a time using ymm registers
hugeloop_avx2:
	CMPQ	BX, $64
	JB	bigloop_avx2
	VMOVDQU	(SI), Y0
	VMOVDQU	(DI), Y1
	VMOVDQU	32(SI), Y2
	VMOVDQU	32(DI), Y3
	VPCMPEQB	Y1, Y0, Y4	// 0xff in every matching byte lane
	VPCMPEQB	Y2, Y3, Y5
	VPAND	Y4, Y5, Y6		// lane stays 0xff only if both halves matched
	VPMOVMSKB Y6, DX		// one mask bit per byte lane
	// Advance before testing the mask; the CMPL must stay last because
	// ADDQ/SUBQ overwrite the flags.
	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, BX
	CMPL	DX, $0xffffffff		// all 32 mask bits set = all 64 bytes equal
	JEQ	hugeloop_avx2
	VZEROUPPER			// clear upper ymm halves before leaving AVX code
	MOVB	$0, (AX)
	RET

bigloop_avx2:
	VZEROUPPER			// clear upper ymm halves before leaving AVX code

	// 8 bytes at a time using 64-bit register
bigloop:
	CMPQ	BX, $8
	JBE	leftover
	MOVQ	(SI), CX
	MOVQ	(DI), DX
	ADDQ	$8, SI
	ADDQ	$8, DI
	SUBQ	$8, BX
	CMPQ	CX, DX			// must follow the ADDQ/SUBQ, which overwrite the flags
	JEQ	bigloop
	MOVB	$0, (AX)
	RET

	// remaining 0-8 bytes
	//
	// Compare the 8 bytes that end exactly at the end of each region.
	// When fewer than 8 bytes remain this re-reads bytes that already
	// compared equal, which is harmless; every path into bigloop starts
	// from a region of at least 8 bytes, so the load stays inside it.
leftover:
	MOVQ	-8(SI)(BX*1), CX
	MOVQ	-8(DI)(BX*1), DX
	CMPQ	CX, DX
	SETEQ	(AX)
	RET

	// Fewer than 8 bytes.
small:
	CMPQ	BX, $0
	JEQ	equal			// zero bytes: ZF is set, so SETEQ at equal stores 1

	// CX = -(8*count). Shifts use only the low 6 bits of CX, so the
	// effective shift amount below is 64 - 8*count.
	LEAQ	0(BX*8), CX
	NEGQ	CX

	// Compares only the low byte of the address.
	CMPB	SI, $0xf8
	JA	si_high

	// load at SI won't cross a page boundary.
	// The wanted bytes land in the low 8*count bits; the bits above
	// them are unrelated memory and are discarded by the SHLQ below.
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	// address ends in 11111xxx. Load up to bytes we want, move to correct position.
	MOVQ	-8(SI)(BX*1), SI
	SHRQ	CX, SI
si_finish:

	// same for DI.
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(BX*1), DI
	SHRQ	CX, DI
di_finish:

	// The low 8*count bits of the difference are zero exactly when the
	// wanted bytes match. Shifting left by 64 - 8*count drops the other
	// bits, so ZF after SHLQ is the answer.
	SUBQ	SI, DI
	SHLQ	CX, DI
equal:
	SETEQ	(AX)
	RET