// SPDX-License-Identifier: Apache-2.0
// Copyright 2023 The Prime Citizens
//
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//
// Origin: github.com/primecitizens/pcz/std@v0.2.1/core/bytealg/count_amd64.s

//go:build pcz && amd64

#include "textflag.h"

// CountSlice counts the bytes equal to c in a byte slice.
//
// Frame layout (from the FP references below): b_base+0, b_len+8, c+24,
// ret+32 -- presumably declared in Go as
//	func CountSlice(b []byte, c byte) int
// (confirm against the Go stub).
//
// Falls back to ·countGeneric when ·hasPOPCNT is not 1.
TEXT ·CountSlice(SB),NOSPLIT,$0-40
	CMPB	·hasPOPCNT(SB), $1
	JEQ	2(PC)			// POPCNT available: skip the fallback jump
	JMP	·countGeneric(SB)
	MOVQ	b_base+0(FP), SI
	MOVQ	b_len+8(FP), BX
	MOVB	c+24(FP), AL
	LEAQ	ret+32(FP), R8
	JMP	countbody<>(SB)

// Count counts the bytes equal to c in a string.
//
// Frame layout (from the FP references below): s_base+0, s_len+8, c+16,
// ret+24 -- presumably declared in Go as
//	func Count(s string, c byte) int
// (confirm against the Go stub).
//
// Falls back to ·countGenericString when ·hasPOPCNT is not 1.
TEXT ·Count(SB),NOSPLIT,$0-32
	CMPB	·hasPOPCNT(SB), $1
	JEQ	2(PC)			// POPCNT available: skip the fallback jump
	JMP	·countGenericString(SB)
	MOVQ	s_base+0(FP), SI
	MOVQ	s_len+8(FP), BX
	MOVB	c+16(FP), AL
	LEAQ	ret+24(FP), R8
	JMP	countbody<>(SB)

// input:
//   SI: data
//   BX: data len
//   AL: byte sought
//   R8: address to put result
// This function requires the POPCNT instruction.
//
// Registers written here: AX, BX, CX, DX, DI, R10, R11, R12, X0, X1,
// and (AVX2 path only) Y1, Y2, Y3.
//
// Strategy:
//   len < 16   one masked 16-byte load (small / endofpage)
//   len <= 32  16-byte SSE loop plus a masked, overlapping tail
//   len > 32   32-byte AVX2 loop plus a masked, overlapping tail when
//              ·hasAVX2 is 1, otherwise the SSE loop
TEXT countbody<>(SB),NOSPLIT,$0
	// Shuffle X0 around so that each byte contains
	// the character we're looking for.
	MOVD	AX, X0
	PUNPCKLBW	X0, X0
	PUNPCKLBW	X0, X0
	PSHUFL	$0, X0, X0

	CMPQ	BX, $16
	JLT	small

	MOVQ	$0, R12			// Accumulator

	MOVQ	SI, DI			// DI = cursor into the data

	CMPQ	BX, $32
	JA	avx2
sse:
	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
	JMP	sseloopentry

sseloop:
	// Move the next 16-byte chunk of the data into X1.
	MOVOU	(DI), X1
	// Compare bytes in X0 to X1.
	PCMPEQB	X0, X1
	// Take the top bit of each byte in X1 and put the result in DX.
	PMOVMSKB	X1, DX
	// Count number of matching bytes
	POPCNTL	DX, DX
	// Accumulate into R12
	ADDQ	DX, R12
	// Advance to next block.
	ADDQ	$16, DI
sseloopentry:
	CMPQ	DI, AX
	JBE	sseloop

	// Get the number of bytes to consider in the last 16 bytes
	ANDQ	$15, BX
	JZ	end

	// Create mask to ignore overlap between previous 16 byte block
	// and the next: keep only the top BX bits of the 16-bit result.
	MOVQ	$16, CX
	SUBQ	BX, CX
	MOVQ	$0xFFFF, R10
	SARQ	CL, R10
	SALQ	CL, R10

	// Process the last 16-byte chunk. This chunk may overlap with the
	// chunks we've already searched so we need to mask part of it.
	MOVOU	(AX), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, DX
	// Apply mask
	ANDQ	R10, DX
	POPCNTL	DX, DX
	ADDQ	DX, R12
end:
	MOVQ	R12, (R8)
	RET

// handle for lengths < 16
small:
	TESTQ	BX, BX
	JEQ	endzero

	// Check if we'll load across a page boundary: bits 4..11 of SI+16
	// are all zero exactly when SI lies in the last 16 bytes of a
	// 4096-byte page.
	LEAQ	16(SI), AX
	TESTW	$0xff0, AX
	JEQ	endofpage

	// We must ignore high bytes as they aren't part of our slice.
	// Create mask: R10 = (1 << BX) - 1.
	MOVB	BX, CX
	MOVQ	$1, R10
	SALQ	CL, R10
	SUBQ	$1, R10

	// Load data
	MOVOU	(SI), X1
	// Compare target byte with each byte in data.
	PCMPEQB	X0, X1
	// Move result bits to integer register.
	PMOVMSKB	X1, DX
	// Apply mask
	ANDQ	R10, DX
	POPCNTL	DX, DX
	// Directly return DX, we don't need to accumulate
	// since we have <16 bytes.
	MOVQ	DX, (R8)
	RET
endzero:
	MOVQ	$0, (R8)
	RET

endofpage:
	// We must ignore low bytes as they aren't part of our slice:
	// keep only the top BX bits of the 16-bit result.
	MOVQ	$16, CX
	SUBQ	BX, CX
	MOVQ	$0xFFFF, R10
	SARQ	CL, R10
	SALQ	CL, R10

	// Load data into the high end of X1 (the 16 bytes ending at SI+BX).
	MOVOU	-16(SI)(BX*1), X1
	// Compare target byte with each byte in data.
	PCMPEQB	X0, X1
	// Move result bits to integer register.
	PMOVMSKB	X1, DX
	// Apply mask
	ANDQ	R10, DX
	// Directly return DX, we don't need to accumulate
	// since we have <16 bytes.
	POPCNTL	DX, DX
	MOVQ	DX, (R8)
	RET

avx2:
	CMPB	·hasAVX2(SB), $1
	JNE	sse
	MOVD	AX, X0
	LEAQ	-32(SI)(BX*1), R11	// R11 = address of last 32 bytes
	VPBROADCASTB	X0, Y1
avx2_loop:
	VMOVDQU	(DI), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPMOVMSKB	Y3, DX
	POPCNTL	DX, DX
	ADDQ	DX, R12
	ADDQ	$32, DI
	// Addresses are compared unsigned, as in sseloop.
	CMPQ	DI, R11
	JBE	avx2_loop

	// The loop has counted every full 32-byte block. Get the number of
	// bytes left over; if the length is a multiple of 32 the last block
	// is already processed, so skip to the end.
	//
	// (The loop only exits once DI > R11, so comparing DI with R11 here
	// can never detect this case; test the remainder instead, like the
	// SSE path does.)
	ANDQ	$31, BX
	JZ	endavx

	// Load the last 32 bytes.
	// There is an overlap with the previous block.
	VMOVDQU	(R11), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPMOVMSKB	Y3, DX
	// Exit AVX mode.
	VZEROUPPER

	// Create mask to ignore overlap between previous 32 byte block
	// and the next: keep only the top BX bits of the 32-bit result.
	MOVQ	$32, CX
	SUBQ	BX, CX
	MOVQ	$0xFFFFFFFF, R10
	SARQ	CL, R10
	SALQ	CL, R10
	// Apply mask
	ANDQ	R10, DX
	POPCNTL	DX, DX
	ADDQ	DX, R12
	MOVQ	R12, (R8)
	RET
endavx:
	// Exit AVX mode.
	VZEROUPPER
	MOVQ	R12, (R8)
	RET