// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "asm_amd64.h"
#include "textflag.h"

// Count counts the bytes in a byte slice that equal c.
//
// Frame layout (40 bytes of arguments/results, no locals):
//	b_base+0(FP)	pointer to the slice data
//	b_len+8(FP)	slice length
//	c+24(FP)	byte sought
//	ret+32(FP)	result slot
//
// Unless the build defines hasPOPCNT, the CPU feature flag is checked at
// run time and the call is forwarded to countGeneric when POPCNT is missing.
TEXT ·Count(SB),NOSPLIT,$0-40
#ifndef hasPOPCNT
	CMPB	internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
	JEQ	2(PC)			// POPCNT available: skip the fallback jump
	JMP	·countGeneric(SB)
#endif
	MOVQ	b_base+0(FP), SI
	MOVQ	b_len+8(FP), BX
	MOVB	c+24(FP), AL
	LEAQ	ret+32(FP), R8		// countbody stores the result through R8
	JMP	countbody<>(SB)

// CountString counts the bytes in a string that equal c.
//
// Frame layout (32 bytes of arguments/results, no locals):
//	s_base+0(FP)	pointer to the string data
//	s_len+8(FP)	string length
//	c+16(FP)	byte sought
//	ret+24(FP)	result slot
//
// Unless the build defines hasPOPCNT, the CPU feature flag is checked at
// run time and the call is forwarded to countGenericString when POPCNT is
// missing.
TEXT ·CountString(SB),NOSPLIT,$0-32
#ifndef hasPOPCNT
	CMPB	internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
	JEQ	2(PC)			// POPCNT available: skip the fallback jump
	JMP	·countGenericString(SB)
#endif
	MOVQ	s_base+0(FP), SI
	MOVQ	s_len+8(FP), BX
	MOVB	c+16(FP), AL
	LEAQ	ret+24(FP), R8		// countbody stores the result through R8
	JMP	countbody<>(SB)

// countbody is the shared implementation behind Count and CountString.
//
// input:
//   SI: data
//   BX: data len
//   AL: byte sought
//   R8: address to put result
// This function requires the POPCNT instruction.
//
// Registers written: AX, BX, CX, DX, DI, R10, R11, R12, X0, X1, Y1-Y3, flags.
//
// Strategy by length:
//   len == 0	store 0
//   len <  16	one masked 16-byte load (small / endofpage)
//   len <= 32	16-byte SSE loop plus a masked, overlapping tail
//   len >  32	32-byte AVX2 loop plus a masked, overlapping tail
//		(falls back to the SSE loop when AVX2 is unavailable)
TEXT countbody<>(SB),NOSPLIT,$0
	// Shuffle X0 around so that each byte contains
	// the character we're looking for.
	MOVD	AX, X0
	PUNPCKLBW	X0, X0
	PUNPCKLBW	X0, X0
	PSHUFL	$0, X0, X0

	CMPQ	BX, $16
	JLT	small

	MOVQ	$0, R12			// Accumulator

	MOVQ	SI, DI			// DI = cursor over the data

	CMPQ	BX, $32
	JA	avx2
sse:
	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
	JMP	sseloopentry

sseloop:
	// Move the next 16-byte chunk of the data into X1.
	MOVOU	(DI), X1
	// Compare bytes in X0 to X1.
	PCMPEQB	X0, X1
	// Take the top bit of each byte in X1 and put the result in DX.
	PMOVMSKB	X1, DX
	// Count number of matching bytes
	POPCNTL	DX, DX
	// Accumulate into R12
	ADDQ	DX, R12
	// Advance to next block.
	ADDQ	$16, DI
sseloopentry:
	CMPQ	DI, AX
	JBE	sseloop

	// Get the number of bytes to consider in the last 16 bytes
	ANDQ	$15, BX
	JZ	end

	// Create mask to ignore overlap between previous 16 byte block
	// and the next: clear the low (16-BX) bits of 0xFFFF so that only
	// the top BX lanes of the final load are counted.
	MOVQ	$16, CX
	SUBQ	BX, CX
	MOVQ	$0xFFFF, R10
	SARQ	CL, R10
	SALQ	CL, R10

	// Process the last 16-byte chunk. This chunk may overlap with the
	// chunks we've already searched so we need to mask part of it.
	MOVOU	(AX), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, DX
	// Apply mask
	ANDQ	R10, DX
	POPCNTL	DX, DX
	ADDQ	DX, R12
end:
	MOVQ	R12, (R8)
	RET

// handle for lengths < 16
small:
	TESTQ	BX, BX
	JEQ	endzero

	// Check if we'll load across a page boundary.
	// Bits 4-11 of SI+16 are all zero only when SI lies in the last
	// 16 bytes of a 4096-byte aligned region, where a 16-byte load
	// starting at SI could run past the end of that region.
	LEAQ	16(SI), AX
	TESTW	$0xff0, AX
	JEQ	endofpage

	// We must ignore high bytes as they aren't part of our slice.
	// Create mask: R10 = (1 << BX) - 1, i.e. the low BX bits set.
	MOVB	BX, CX
	MOVQ	$1, R10
	SALQ	CL, R10
	SUBQ	$1, R10

	// Load data
	MOVOU	(SI), X1
	// Compare target byte with each byte in data.
	PCMPEQB	X0, X1
	// Move result bits to integer register.
	PMOVMSKB	X1, DX
	// Apply mask
	ANDQ	R10, DX
	POPCNTL	DX, DX
	// Directly return DX, we don't need to accumulate
	// since we have <16 bytes.
	MOVQ	DX, (R8)
	RET
endzero:
	MOVQ	$0, (R8)
	RET

endofpage:
	// We must ignore low bytes as they aren't part of our slice.
	// The load below ends exactly at the end of the data, so the
	// data occupies the top BX lanes; keep only those bits.
	MOVQ	$16, CX
	SUBQ	BX, CX
	MOVQ	$0xFFFF, R10
	SARQ	CL, R10
	SALQ	CL, R10

	// Load data into the high end of X1.
	MOVOU	-16(SI)(BX*1), X1
	// Compare target byte with each byte in data.
	PCMPEQB	X0, X1
	// Move result bits to integer register.
	PMOVMSKB	X1, DX
	// Apply mask
	ANDQ	R10, DX
	// Directly return DX, we don't need to accumulate
	// since we have <16 bytes.
	POPCNTL	DX, DX
	MOVQ	DX, (R8)
	RET

avx2:
#ifndef hasAVX2
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JNE	sse
#endif
	MOVD	AX, X0
	LEAQ	-32(SI)(BX*1), R11	// R11 = address of last 32 bytes
	VPBROADCASTB	X0, Y1		// every byte of Y1 = byte sought
avx2_loop:
	VMOVDQU	(DI), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPMOVMSKB	Y3, DX
	POPCNTL	DX, DX
	ADDQ	DX, R12
	ADDQ	$32, DI
	CMPQ	DI, R11
	JLE	avx2_loop

	// BX = number of bytes left over after the last full 32-byte block.
	// The loop always exits with DI > R11, so comparing DI with R11
	// cannot tell whether anything is left; the remainder of the
	// length can. If it is zero, every byte has been counted already.
	ANDQ	$31, BX
	JZ	endavx

	// Load address of the last 32 bytes.
	// There is an overlap with the previous block.
	MOVQ	R11, DI
	VMOVDQU	(DI), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPMOVMSKB	Y3, DX
	// Exit AVX mode.
	VZEROUPPER

	// Create mask to ignore overlap between previous 32 byte block
	// and the next: clear the low (32-BX) bits of 0xFFFFFFFF so that
	// only the top BX lanes of the final load are counted.
	MOVQ	$32, CX
	SUBQ	BX, CX
	MOVQ	$0xFFFFFFFF, R10
	SARQ	CL, R10
	SALQ	CL, R10
	// Apply mask
	ANDQ	R10, DX
	POPCNTL	DX, DX
	ADDQ	DX, R12
	MOVQ	R12, (R8)
	RET
endavx:
	// Exit AVX mode.
	VZEROUPPER
	MOVQ	R12, (R8)
	RET