// SPDX-License-Identifier: Apache-2.0
// Copyright 2023 The Prime Citizens
//
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build pcz && amd64

#include "textflag.h"

// IndexSliceByte loads its arguments from the frame and tail-jumps to
// indexbytebody<>, which stores the result in ret+32(FP).
//
// Frame layout (40 bytes of arguments/results):
//   b_base+0(FP)   pointer to the first byte of the data
//   b_len+8(FP)    number of bytes to search
//   c+24(FP)       byte to look for
//   ret+32(FP)     result slot
// NOTE(review): the offsets match a Go declaration of the form
// func IndexSliceByte(b []byte, c byte) int -- confirm against the Go stub.
TEXT ·IndexSliceByte(SB),NOSPLIT,$0-40
	MOVQ	b_base+0(FP), SI
	MOVQ	b_len+8(FP), BX
	MOVB	c+24(FP), AL
	LEAQ	ret+32(FP), R8
	JMP	indexbytebody<>(SB)

// IndexByte loads its arguments from the frame and tail-jumps to
// indexbytebody<>, which stores the result in ret+24(FP).
//
// Frame layout (32 bytes of arguments/results):
//   s_base+0(FP)   pointer to the first byte of the data
//   s_len+8(FP)    number of bytes to search
//   c+16(FP)       byte to look for
//   ret+24(FP)     result slot
// NOTE(review): the offsets match a Go declaration of the form
// func IndexByte(s string, c byte) int -- confirm against the Go stub.
TEXT ·IndexByte(SB),NOSPLIT,$0-32
	MOVQ	s_base+0(FP), SI
	MOVQ	s_len+8(FP), BX
	MOVB	c+16(FP), AL
	LEAQ	ret+24(FP), R8
	JMP	indexbytebody<>(SB)

// indexbytebody finds the first occurrence of a byte in a buffer.
//
// input:
//   SI: data
//   BX: data len
//   AL: byte sought
//   R8: address to put result
//
// output:
//   (R8): 64-bit index of the first matching byte, or -1 if there is none.
//
// clobbers: AX, CX, DX, DI, R11, X0, X1, Y1, Y2, Y3, flags.
//
// Three strategies are used depending on the length:
//   len < 16        one 16-byte load, positioned so that it never crosses
//                   into the next 4 KiB page (small / endofpage)
//   16 <= len <= 32 16-byte SSE chunks plus one overlapping tail chunk
//   len > 32        32-byte AVX2 chunks plus one overlapping tail chunk,
//                   when ·hasAVX2 is 1; otherwise the SSE path
TEXT indexbytebody<>(SB), NOSPLIT, $0
	// Shuffle X0 around so that each byte contains
	// the character we're looking for.
	MOVD	AX, X0
	PUNPCKLBW	X0, X0
	PUNPCKLBW	X0, X0
	PSHUFL	$0, X0, X0

	CMPQ	BX, $16
	JLT	small

	// DI is the cursor: address of the chunk currently being examined.
	MOVQ	SI, DI

	CMPQ	BX, $32
	JA	avx2
sse:
	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
	JMP	sseloopentry

sseloop:
	// Move the next 16-byte chunk of the data into X1.
	MOVOU	(DI), X1
	// Compare bytes in X0 to X1.
	PCMPEQB	X0, X1
	// Take the top bit of each byte in X1 and put the result in DX.
	PMOVMSKB	X1, DX
	// Find first set bit, if any.
	BSFL	DX, DX
	JNZ	ssesuccess
	// Advance to next block.
	ADDQ	$16, DI
sseloopentry:
	// Keep going while a full chunk starting at DI ends before the tail chunk.
	CMPQ	DI, AX
	JB	sseloop

	// Search the last 16-byte chunk. This chunk may overlap with the
	// chunks we've already searched, but that's ok.
	MOVQ	AX, DI
	MOVOU	(AX), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, DX
	BSFL	DX, DX
	JNZ	ssesuccess

failure:
	MOVQ	$-1, (R8)
	RET

// We've found a chunk containing the byte.
// The chunk was loaded from DI.
// The index of the matching byte in the chunk is DX.
// The start of the data is SI.
ssesuccess:
	SUBQ	SI, DI	// Compute offset of chunk within data.
	ADDQ	DX, DI	// Add offset of byte within chunk.
	MOVQ	DI, (R8)
	RET

// handle for lengths < 16
small:
	// Empty input never matches.
	TESTQ	BX, BX
	JEQ	failure

	// Check if we'll load across a page boundary.
	// Bits 4..11 of SI+16 are all zero exactly when SI lies in the last
	// 16 bytes of a 4 KiB page; a 16-byte load starting at SI could then
	// touch the following page, so load backwards from the end instead.
	LEAQ	16(SI), AX
	TESTW	$0xff0, AX
	JEQ	endofpage

	MOVOU	(SI), X1	// Load data
	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
	PMOVMSKB	X1, DX	// Move result bits to integer register.
	BSFL	DX, DX	// Find first set bit.
	JZ	failure	// No set bit, failure.
	CMPL	DX, BX
	JAE	failure	// Match is past end of data.
	MOVQ	DX, (R8)
	RET

endofpage:
	// The 16 bytes ending at SI+BX are loaded, so the data occupies the
	// top BX bytes of X1 and bits 16-BX..15 of the mask. Shifting left by
	// BX and then right by 16 drops the bits that belong to bytes before
	// SI and leaves the data's bits at positions 0..BX-1.
	MOVOU	-16(SI)(BX*1), X1	// Load data into the high end of X1.
	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
	PMOVMSKB	X1, DX	// Move result bits to integer register.
	MOVL	BX, CX
	SHLL	CX, DX
	SHRL	$16, DX	// Shift desired bits down to bottom of register.
	BSFL	DX, DX	// Find first set bit.
	JZ	failure	// No set bit, failure.
	MOVQ	DX, (R8)
	RET

avx2:
	// Only reached with BX > 32. Use the SSE path unless the hasAVX2
	// flag (defined outside this file) is 1.
	CMPB	·hasAVX2(SB), $1
	JNE	sse
	// Put the sought byte in the low lane of X0 and replicate it into
	// all 32 bytes of Y1.
	MOVD	AX, X0
	LEAQ	-32(SI)(BX*1), R11	// R11 = address of last 32 bytes
	VPBROADCASTB	X0, Y1
avx2_loop:
	VMOVDQU	(DI), Y2
	VPCMPEQB	Y1, Y2, Y3
	// ZF is clear iff at least one byte in the chunk matched.
	VPTEST	Y3, Y3
	JNZ	avx2success
	ADDQ	$32, DI
	// DI and R11 are addresses, so compare them as unsigned values,
	// exactly as the SSE loop does with JB.
	CMPQ	DI, R11
	JB	avx2_loop
	// Search the last 32-byte chunk; it may overlap chunks already searched.
	MOVQ	R11, DI
	VMOVDQU	(DI), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPTEST	Y3, Y3
	JNZ	avx2success
	// Clear the upper YMM halves before returning to non-AVX code.
	VZEROUPPER
	MOVQ	$-1, (R8)
	RET

// A 32-byte chunk loaded from DI contains the byte; Y3 holds the
// per-byte comparison result.
avx2success:
	VPMOVMSKB	Y3, DX	// One mask bit per byte of the chunk.
	BSFL	DX, DX	// Index of the first match within the chunk.
	SUBQ	SI, DI	// Offset of the chunk within the data.
	ADDQ	DI, DX	// Index of the match within the data.
	MOVQ	DX, (R8)
	VZEROUPPER
	RET