// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// File: github.com/SandwichDev/go-internals@v0.0.0-20210605002614-12311ac6b2c5/bytealg/count_amd64.s

#include "go_asm.h"
#include "textflag.h"

// Count stores in ret the number of bytes equal to c in the slice b.
//
// Frame layout, as used below: b_base+0(FP), b_len+8(FP), c+24(FP),
// ret+32(FP). Presumably declared on the Go side as
// func Count(b []byte, c byte) int -- confirm against the Go stub.
//
// If the CPU feature flag X86.HasPOPCNT is not 1, control is handed to
// countGeneric (defined outside this file) with the frame untouched.
TEXT ·Count(SB),NOSPLIT,$0-40
	CMPB	internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
	JEQ	2(PC)			// POPCNT present: skip the fallback jump
	JMP	·countGeneric(SB)
	MOVQ	b_base+0(FP), SI	// SI = data pointer
	MOVQ	b_len+8(FP), BX		// BX = length in bytes
	MOVB	c+24(FP), AL		// AL = byte sought
	LEAQ	ret+32(FP), R8		// R8 = where countbody writes the result
	JMP	countbody<>(SB)

// CountString stores in ret the number of bytes equal to c in the string s.
//
// Frame layout, as used below: s_base+0(FP), s_len+8(FP), c+16(FP),
// ret+24(FP). Presumably declared on the Go side as
// func CountString(s string, c byte) int -- confirm against the Go stub.
//
// If the CPU feature flag X86.HasPOPCNT is not 1, control is handed to
// countGenericString (defined outside this file) with the frame untouched.
TEXT ·CountString(SB),NOSPLIT,$0-32
	CMPB	internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
	JEQ	2(PC)			// POPCNT present: skip the fallback jump
	JMP	·countGenericString(SB)
	MOVQ	s_base+0(FP), SI	// SI = data pointer
	MOVQ	s_len+8(FP), BX		// BX = length in bytes
	MOVB	c+16(FP), AL		// AL = byte sought
	LEAQ	ret+24(FP), R8		// R8 = where countbody writes the result
	JMP	countbody<>(SB)

// countbody is the shared implementation behind Count and CountString.
//
// input:
//   SI: data
//   BX: data len
//   AL: byte sought
//   R8: address to put result
// Registers written: AX, BX, CX, DX, DI, R10, R11, R12, X0, X1, Y1-Y3, flags.
// This function requires the POPCNT instruction.
//
// Strategy:
//   len < 16   one (possibly shifted) 16-byte load plus a bit mask (small).
//   len <= 32, or no AVX2
//              16-byte SSE loop plus a masked, overlapping tail (sse).
//   len > 32 and AVX2
//              32-byte AVX2 loop plus a masked, overlapping tail (avx2).
TEXT countbody<>(SB),NOSPLIT,$0
	// Shuffle X0 around so that each byte contains
	// the character we're looking for.
	MOVD	AX, X0
	PUNPCKLBW	X0, X0
	PUNPCKLBW	X0, X0
	PSHUFL	$0, X0, X0

	CMPQ	BX, $16
	JLT	small

	MOVQ	$0, R12			// Accumulator

	MOVQ	SI, DI			// DI = cursor over the data

	CMPQ	BX, $32
	JA	avx2
sse:
	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
	JMP	sseloopentry

sseloop:
	// Move the next 16-byte chunk of the data into X1.
	MOVOU	(DI), X1
	// Compare bytes in X0 to X1.
	PCMPEQB	X0, X1
	// Take the top bit of each byte in X1 and put the result in DX.
	PMOVMSKB	X1, DX
	// Count number of matching bytes
	POPCNTL	DX, DX
	// Accumulate into R12
	ADDQ	DX, R12
	// Advance to next block.
	ADDQ	$16, DI
sseloopentry:
	CMPQ	DI, AX
	JBE	sseloop

	// Get the number of bytes to consider in the last 16 bytes.
	// Zero means the loop already covered the whole input.
	ANDQ	$15, BX
	JZ	end

	// Create mask to ignore overlap between previous 16 byte block
	// and the next: keep only the top BX bits of the 16-bit match mask.
	MOVQ	$16, CX
	SUBQ	BX, CX
	MOVQ	$0xFFFF, R10
	SARQ	CL, R10
	SALQ	CL, R10

	// Process the last 16-byte chunk. This chunk may overlap with the
	// chunks we've already searched so we need to mask part of it.
	MOVOU	(AX), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, DX
	// Apply mask
	ANDQ	R10, DX
	POPCNTL	DX, DX
	ADDQ	DX, R12
end:
	MOVQ	R12, (R8)
	RET

// handle for lengths < 16
small:
	TESTQ	BX, BX
	JEQ	endzero

	// Check if we'll load across a page boundary: a 16-byte load at SI
	// is only a risk when SI lies in the last 16 bytes of a 4 KiB page.
	LEAQ	16(SI), AX
	TESTW	$0xff0, AX
	JEQ	endofpage

	// We must ignore high bytes as they aren't part of our slice.
	// Create mask with the low BX bits set.
	MOVB	BX, CX
	MOVQ	$1, R10
	SALQ	CL, R10
	SUBQ	$1, R10

	// Load data
	MOVOU	(SI), X1
	// Compare target byte with each byte in data.
	PCMPEQB	X0, X1
	// Move result bits to integer register.
	PMOVMSKB	X1, DX
	// Apply mask
	ANDQ	R10, DX
	POPCNTL	DX, DX
	// Directly return DX, we don't need to accumulate
	// since we have <16 bytes.
	MOVQ	DX, (R8)
	RET
endzero:
	MOVQ	$0, (R8)
	RET

endofpage:
	// The load below ends exactly at the end of the data and starts
	// before it, so we must ignore low bytes as they aren't part of
	// our slice: clear the low 16-BX bits of the match mask.
	MOVQ	$16, CX
	SUBQ	BX, CX
	MOVQ	$0xFFFF, R10
	SARQ	CL, R10
	SALQ	CL, R10

	// Load data into the high end of X1.
	MOVOU	-16(SI)(BX*1), X1
	// Compare target byte with each byte in data.
	PCMPEQB	X0, X1
	// Move result bits to integer register.
	PMOVMSKB	X1, DX
	// Apply mask
	ANDQ	R10, DX
	// Directly return DX, we don't need to accumulate
	// since we have <16 bytes.
	POPCNTL	DX, DX
	MOVQ	DX, (R8)
	RET

avx2:
	// Only reached with BX > 32. Fall back to the SSE loop when the
	// CPU feature flag X86.HasAVX2 is not 1.
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JNE	sse
	LEAQ	-32(SI)(BX*1), R11	// R11 = address of last 32 bytes
	// X0 already holds the sought byte in every lane, so its low byte
	// can be broadcast directly.
	VPBROADCASTB	X0, Y1
avx2_loop:
	VMOVDQU	(DI), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPMOVMSKB	Y3, DX
	POPCNTL	DX, DX
	ADDQ	DX, R12
	ADDQ	$32, DI
	// Addresses are unsigned: use JBE, as the SSE loop does.
	CMPQ	DI, R11
	JBE	avx2_loop

	// BX = number of trailing bytes the loop did not cover (len mod 32).
	// If it is zero the last block is already processed, so skip the
	// overlapping tail entirely.
	ANDQ	$31, BX
	JZ	endavx

	// Load the last 32 bytes.
	// There is an overlap with the previous block.
	VMOVDQU	(R11), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPMOVMSKB	Y3, DX
	// Exit AVX mode.
	VZEROUPPER

	// Create mask to ignore overlap between previous 32 byte block
	// and the next: keep only the top BX bits of the 32-bit match mask.
	// BX is in 1..31 here, so the shift count is in 1..31 as well.
	MOVQ	$32, CX
	SUBQ	BX, CX
	MOVQ	$0xFFFFFFFF, R10
	SARQ	CL, R10
	SALQ	CL, R10
	// Apply mask
	ANDQ	R10, DX
	POPCNTL	DX, DX
	ADDQ	DX, R12
	MOVQ	R12, (R8)
	RET
endavx:
	// Exit AVX mode.
	VZEROUPPER
	MOVQ	R12, (R8)
	RET