github.com/bir3/gocompiler@v0.9.2202/src/internal/bytealg/count_amd64.s

// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "asm_amd64.h"
#include "textflag.h"

TEXT ·Count(SB),NOSPLIT,$0-40
#ifndef hasPOPCNT
	CMPB internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
	JEQ 2(PC)
	JMP ·countGeneric(SB)
#endif
	MOVQ b_base+0(FP), SI
	MOVQ b_len+8(FP), BX
	MOVB c+24(FP), AL
	LEAQ ret+32(FP), R8
	JMP countbody<>(SB)

TEXT ·CountString(SB),NOSPLIT,$0-32
#ifndef hasPOPCNT
	CMPB internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
	JEQ 2(PC)
	JMP ·countGenericString(SB)
#endif
	MOVQ s_base+0(FP), SI
	MOVQ s_len+8(FP), BX
	MOVB c+16(FP), AL
	LEAQ ret+24(FP), R8
	JMP countbody<>(SB)

// input:
//   SI: data
//   BX: data len
//   AL: byte sought
//   R8: address to put result
// This function requires the POPCNT instruction.
TEXT countbody<>(SB),NOSPLIT,$0
	// Shuffle X0 around so that each byte contains
	// the character we're looking for.
	MOVD AX, X0
	PUNPCKLBW X0, X0
	PUNPCKLBW X0, X0
	PSHUFL $0, X0, X0

	CMPQ BX, $16
	JLT small

	MOVQ $0, R12 // Accumulator

	MOVQ SI, DI

	CMPQ BX, $64
	JAE avx2
sse:
	LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes
	JMP sseloopentry

	PCALIGN $16
sseloop:
	// Move the next 16-byte chunk of the data into X1.
	MOVOU (DI), X1
	// Compare bytes in X0 to X1.
	PCMPEQB X0, X1
	// Take the top bit of each byte in X1 and put the result in DX.
	PMOVMSKB X1, DX
	// Count number of matching bytes.
	POPCNTL DX, DX
	// Accumulate into R12.
	ADDQ DX, R12
	// Advance to next block.
	ADDQ $16, DI
sseloopentry:
	CMPQ DI, AX
	JBE sseloop

	// Get the number of bytes to consider in the last 16 bytes.
	ANDQ $15, BX
	JZ end

	// Create mask to ignore overlap between previous 16-byte block
	// and the next.
	MOVQ $16, CX
	SUBQ BX, CX
	MOVQ $0xFFFF, R10
	SARQ CL, R10
	SALQ CL, R10

	// Process the last 16-byte chunk. This chunk may overlap with the
	// chunks we've already searched so we need to mask part of it.
	MOVOU (AX), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, DX
	// Apply mask.
	ANDQ R10, DX
	POPCNTL DX, DX
	ADDQ DX, R12
end:
	MOVQ R12, (R8)
	RET

// Handle lengths < 16.
small:
	TESTQ BX, BX
	JEQ endzero

	// Check if we'll load across a page boundary.
	LEAQ 16(SI), AX
	TESTW $0xff0, AX
	JEQ endofpage

	// We must ignore high bytes as they aren't part of our slice.
	// Create mask.
	MOVB BX, CX
	MOVQ $1, R10
	SALQ CL, R10
	SUBQ $1, R10

	// Load data.
	MOVOU (SI), X1
	// Compare target byte with each byte in data.
	PCMPEQB X0, X1
	// Move result bits to integer register.
	PMOVMSKB X1, DX
	// Apply mask.
	ANDQ R10, DX
	POPCNTL DX, DX
	// Directly return DX, we don't need to accumulate
	// since we have <16 bytes.
	MOVQ DX, (R8)
	RET
endzero:
	MOVQ $0, (R8)
	RET

endofpage:
	// We must ignore low bytes as they aren't part of our slice.
	MOVQ $16, CX
	SUBQ BX, CX
	MOVQ $0xFFFF, R10
	SARQ CL, R10
	SALQ CL, R10

	// Load data into the high end of X1.
	MOVOU -16(SI)(BX*1), X1
	// Compare target byte with each byte in data.
	PCMPEQB X0, X1
	// Move result bits to integer register.
	PMOVMSKB X1, DX
	// Apply mask.
	ANDQ R10, DX
	// Directly return DX, we don't need to accumulate
	// since we have <16 bytes.
	POPCNTL DX, DX
	MOVQ DX, (R8)
	RET

avx2:
#ifndef hasAVX2
	CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JNE sse
#endif
	MOVD AX, X0
	LEAQ -64(SI)(BX*1), R11
	LEAQ (SI)(BX*1), R13
	VPBROADCASTB X0, Y1
	PCALIGN $32
avx2_loop:
	VMOVDQU (DI), Y2
	VMOVDQU 32(DI), Y4
	VPCMPEQB Y1, Y2, Y3
	VPCMPEQB Y1, Y4, Y5
	VPMOVMSKB Y3, DX
	VPMOVMSKB Y5, CX
	POPCNTL DX, DX
	POPCNTL CX, CX
	ADDQ DX, R12
	ADDQ CX, R12
	ADDQ $64, DI
	CMPQ DI, R11
	JLE avx2_loop

	// If the last block is already processed,
	// skip to the end.
	//
	// This check is NOT an optimization; if the input length is a
	// multiple of 64, we must not go through the last leg of the
	// function because the bit shift count passed to SALQ below would
	// be 64, which is outside of the 0-63 range supported by those
	// instructions.
	//
	// Tests in the bytes and strings packages with input lengths that
	// are multiples of 64 will break if this condition were removed.
	CMPQ DI, R13
	JEQ endavx

	// Load the last 64 bytes.
	// There is an overlap with the previous block.
	MOVQ R11, DI
	VMOVDQU (DI), Y2
	VMOVDQU 32(DI), Y4
	VPCMPEQB Y1, Y2, Y3
	VPCMPEQB Y1, Y4, Y5
	VPMOVMSKB Y3, DX
	VPMOVMSKB Y5, CX
	// Exit AVX mode.
	VZEROUPPER
	SALQ $32, CX
	ORQ CX, DX

	// Create mask to ignore overlap between previous 64-byte block
	// and the next.
	ANDQ $63, BX
	MOVQ $64, CX
	SUBQ BX, CX
	MOVQ $0xFFFFFFFFFFFFFFFF, R10
	SALQ CL, R10
	// Apply mask.
	ANDQ R10, DX
	POPCNTQ DX, DX
	ADDQ DX, R12
	MOVQ R12, (R8)
	RET
endavx:
	// Exit AVX mode.
	VZEROUPPER
	MOVQ R12, (R8)
	RET
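
The snippet below is a rough, portable Go sketch of the counting scheme countbody<> implements; it is not part of this file, and the names countRef and blockMask are illustrative assumptions. The real fast paths build the per-block match mask with PCMPEQB/PMOVMSKB (VPCMPEQB/VPMOVMSKB on the AVX2 path) and count its set bits with POPCNT, then reload the final, possibly overlapping block and mask off the bits that were already counted.

package bytealgsketch

import "math/bits"

// blockMask returns a bitmask with bit i set when block[i] == c,
// mirroring PCMPEQB followed by PMOVMSKB on a 16-byte chunk.
func blockMask(block []byte, c byte) uint16 {
	var m uint16
	for i, x := range block {
		if x == c {
			m |= 1 << i
		}
	}
	return m
}

// countRef counts occurrences of c in b the way countbody<> does:
// popcount the match mask of each 16-byte block, then handle the
// remainder with a single overlapping, masked block.
func countRef(b []byte, c byte) int {
	if len(b) < 16 {
		// Small case: one (partial) block, no accumulator needed.
		return bits.OnesCount16(blockMask(b, c))
	}
	n := 0
	i := 0
	for ; i+16 <= len(b); i += 16 {
		n += bits.OnesCount16(blockMask(b[i:i+16], c))
	}
	if rem := len(b) - i; rem > 0 {
		// Reload the last 16 bytes and clear the low 16-rem mask bits,
		// which correspond to bytes the loop above already counted
		// (the SARQ/SALQ mask in the SSE tail, SALQ in the AVX2 tail).
		m := blockMask(b[len(b)-16:], c)
		m &= ^uint16(0) << (16 - rem)
		n += bits.OnesCount16(m)
	}
	return n
}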