// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Listing header of this copy:
// github.com/icodeface/tls@v0.0.0-20230910023335-34df9250cd12/internal/bytealg/indexbyte_ppc64x.s

// +build ppc64 ppc64le

#include "go_asm.h"
#include "textflag.h"

// IndexByte: slice entry point.
//
// Frame layout used below (40 bytes of arguments/results):
//   b_base+0(FP)  pointer to the first byte
//   b_len+8(FP)   length in bytes
//   c+24(FP)      byte to search for
//   ret+32(FP)    result slot
// This corresponds to a Go prototype of the form
//   func IndexByte(b []byte, c byte) int
//
// The arguments are loaded into R3/R4/R5, the address of the result slot
// into R14, and control is transferred to indexbytebody, which stores the
// result through R14 and returns to the caller.
TEXT ·IndexByte(SB),NOSPLIT|NOFRAME,$0-40
	MOVD	b_base+0(FP), R3	// R3 = byte array pointer
	MOVD	b_len+8(FP), R4		// R4 = length
	MOVBZ	c+24(FP), R5		// R5 = byte
	MOVD	$ret+32(FP), R14	// R14 = &ret
	BR	indexbytebody<>(SB)

// IndexByteString: string entry point.
//
// Frame layout used below (32 bytes of arguments/results):
//   s_base+0(FP)  pointer to the first byte
//   s_len+8(FP)   length in bytes
//   c+16(FP)      byte to search for
//   ret+24(FP)    result slot
// This corresponds to a Go prototype of the form
//   func IndexByteString(s string, c byte) int
//
// Same register set-up as IndexByte, then a branch to indexbytebody.
TEXT ·IndexByteString(SB),NOSPLIT|NOFRAME,$0-32
	MOVD	s_base+0(FP), R3	// R3 = string
	MOVD	s_len+8(FP), R4		// R4 = length
	MOVBZ	c+16(FP), R5		// R5 = byte
	MOVD	$ret+24(FP), R14	// R14 = &ret
	BR	indexbytebody<>(SB)

// indexbytebody is the search routine shared by both entry points.
//
// In:
//   R3  = address of the first byte (s_base)
//   R4  = length in bytes
//   R5  = byte to find (zero-extended)
//   R14 = address where the result is stored
// Out:
//   (R14) = index of the match relative to the entry value of R3,
//           or -1 when no byte in [s_base, s_base+length) matches.
// Registers written: R3-R12, R17, CTR, CR0, CR6, CR7, V0-V14.
//
// Outline:
//   - length ≤ 32: small_string, an unrolled sequence of up to five
//     doubleword loads compared with CMPB.
//   - otherwise: doubleword steps until the cursor (R8) is 16-byte aligned,
//     quadword (LVX) steps until it is 64-byte aligned, a 64-bytes-per-
//     iteration vector loop, and finally a tail of up to four quadwords.
//
// Every load is issued from an aligned cursor, so bytes outside the
// requested range can be fetched; they are masked (R9) at the front and
// rejected by the length checks in done / found_qw_align at the back.
TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
	MOVD	R3,R17		// Save base address for calculating the index later.
	RLDICR	$0,R3,$60,R8	// Align address to doubleword boundary in R8.
	RLDIMI	$8,R5,$48,R5	// Replicating the byte across the register.
	ADD	R4,R3,R7	// Last acceptable address in R7 (s_base + length here; decremented below).
	DCBT	(R8)		// Prepare cache line.

	RLDIMI	$16,R5,$32,R5
	CMPU	R4,$32		// Check if it's a small string (≤32 bytes). Those will be processed differently.
	MOVD	$-1,R9
	WORD	$0x54661EB8	// Calculate padding in R6 (rlwinm r6,r3,3,26,28), i.e. R6 = (R3 & 7) * 8 bits.
	RLDIMI	$32,R5,$0,R5
	MOVD	R7,R10		// Save s_base + length (end address) in R10 for later.
	ADD	$-1,R7,R7	// R7 = address of the final byte.
#ifdef GOARCH_ppc64le
	SLD	R6,R9,R9	// Prepare mask for Little Endian
#else
	SRD	R6,R9,R9	// Same for Big Endian
#endif
	BLE	small_string	// Jump to the small string case if it's ≤32 bytes.

	// If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
	// in V0, V1 and V10, then branch to the preloop.
	ANDCC	$63,R3,R11
	BEQ	CR0,qw_align
	RLDICL	$0,R3,$61,R11	// R11 = R3 & 7, offset of s_base inside its doubleword.

	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
	CMPB	R12,R5,R3	// Check for a match.
	AND	R9,R3,R3	// Mask bytes below s_base
	RLDICL	$0,R7,$61,R6	// length-1
	RLDICR	$0,R7,$60,R7	// Last doubleword in R7
	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation
	BNE	CR7,done
	ADD	$8,R8,R8
	ADD	$-8,R4,R4
	ADD	R4,R11,R4	// R4 = bytes remaining from the new cursor (length - 8 + offset).

	// Check for quadword alignment
	ANDCC	$15,R8,R11
	BEQ	CR0,qw_align

	// Not aligned, so handle the next doubleword
	MOVD	0(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR7
	BNE	CR7,done
	ADD	$8,R8,R8
	ADD	$-8,R4,R4

	// Either quadword aligned or 64-byte at this point. We can use LVX.
qw_align:

	// Set up auxiliary data for the vectorized algorithm.
	VSPLTISB	$0,V0		// Replicate 0 across V0
	VSPLTISB	$3,V10		// Use V10 as control for VBPERMQ
	MTVRD	R5,V1
	LVSL	(R0+R0),V11
	VSLB	V11,V10,V10
	VSPLTB	$7,V1,V1	// Replicate byte across V1
	CMPU	R4, $64		// If len ≤ 64, don't use the vectorized loop
	BLE	tail

	// We will load 4 quadwords per iteration in the loop, so check for
	// 64-byte alignment. If 64-byte aligned, then branch to the preloop.
	ANDCC	$63,R8,R11
	BEQ	CR0,preloop

	// Not 64-byte aligned. Load one quadword at a time until aligned.
	// At most three quadword steps are needed to go from 16-byte to
	// 64-byte alignment, hence the three unrolled copies below.
	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
	BNE	CR6,found_qw_align
	ADD	$16,R8,R8
	ADD	$-16,R4,R4

	ANDCC	$63,R8,R11
	BEQ	CR0,preloop
	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
	BNE	CR6,found_qw_align
	ADD	$16,R8,R8
	ADD	$-16,R4,R4

	ANDCC	$63,R8,R11
	BEQ	CR0,preloop
	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
	BNE	CR6,found_qw_align
	ADD	$-16,R4,R4
	ADD	$16,R8,R8

	// 64-byte aligned. Prepare for the main loop.
preloop:
	CMPU	R4,$64
	BLE	tail		// If len ≤ 64, don't use the vectorized loop

	// We are now aligned to a 64-byte boundary. We will load 4 quadwords
	// per loop iteration. R10 holds the end address (s_base + length), so
	// our loop counter starts at (R10-R8)/64.
	SUB	R8,R10,R6	// R6 = bytes remaining from the cursor.
	SRD	$6,R6,R9	// Loop counter in R9
	MOVD	R9,CTR

	ADD	$-64,R8,R8	// Adjust index for loop entry
	MOVD	$16,R11		// Load offsets for the vector loads
	MOVD	$32,R9
	MOVD	$48,R7

	// Main loop we will load 64 bytes per iteration
loop:
	ADD	$64,R8,R8	// Fuse addi+lvx for performance
	LVX	(R8+R0),V2	// Load 4 16-byte vectors
	LVX	(R8+R11),V3
	VCMPEQUB	V1,V2,V6	// Look for byte in each vector
	VCMPEQUB	V1,V3,V7

	LVX	(R8+R9),V4
	LVX	(R8+R7),V5
	VCMPEQUB	V1,V4,V8
	VCMPEQUB	V1,V5,V9

	VOR	V6,V7,V11	// Compress the result in a single vector
	VOR	V8,V9,V12
	VOR	V11,V12,V13
	VCMPEQUBCC	V0,V13,V14	// Check for byte (V13 is compared against the zero vector V0)
	BGE	CR6,found
	BC	16,0,loop	// bdnz loop

	// Handle the trailing bytes or R4 ≤ 64
	RLDICL	$0,R6,$58,R4	// R4 = R6 & 63, bytes left after the whole 64-byte chunks.
	ADD	$64,R8,R8
tail:
	// Up to four quadwords remain. R4 counts the valid bytes from R8;
	// found_qw_align rejects a hit at or beyond R4.
	CMPU	R4,$0
	BEQ	notfound
	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6
	BNE	CR6,found_qw_align
	ADD	$16,R8,R8
	CMPU	R4,$16,CR6
	BLE	CR6,notfound
	ADD	$-16,R4,R4

	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6
	BNE	CR6,found_qw_align
	ADD	$16,R8,R8
	CMPU	R4,$16,CR6
	BLE	CR6,notfound
	ADD	$-16,R4,R4

	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6
	BNE	CR6,found_qw_align
	ADD	$16,R8,R8
	CMPU	R4,$16,CR6
	BLE	CR6,notfound
	ADD	$-16,R4,R4

	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6
	BNE	CR6,found_qw_align

notfound:
	MOVD	$-1,R3
	MOVD	R3,(R14)	// Store -1 in the caller's result slot.
	RET

found:
	// We will now compress the results into a single doubleword,
	// so it can be moved to a GPR for the final index calculation.

	// The bytes in V6-V9 are either 0x00 or 0xFF. So, permute the
	// first bit of each byte into bits 48-63.
	VBPERMQ	V6,V10,V6
	VBPERMQ	V7,V10,V7
	VBPERMQ	V8,V10,V8
	VBPERMQ	V9,V10,V9

	// Shift each 16-bit component into its correct position for
	// merging into a single doubleword.
#ifdef GOARCH_ppc64le
	VSLDOI	$2,V7,V7,V7
	VSLDOI	$4,V8,V8,V8
	VSLDOI	$6,V9,V9,V9
#else
	VSLDOI	$6,V6,V6,V6
	VSLDOI	$4,V7,V7,V7
	VSLDOI	$2,V8,V8,V8
#endif

	// Merge V6-V9 into a single doubleword and move to a GPR.
	VOR	V6,V7,V11
	VOR	V8,V9,V4
	VOR	V4,V11,V4
	MFVRD	V4,R3

#ifdef GOARCH_ppc64le
	ADD	$-1,R3,R11
	ANDN	R3,R11,R11
	POPCNTD	R11,R11		// Count trailing zeros (Little Endian).
#else
	CNTLZD	R3,R11		// Count leading zeros (Big Endian).
#endif
	ADD	R8,R11,R3	// Calculate byte address

return:
	SUB	R17,R3		// Index = match address - saved base address.
	MOVD	R3,(R14)	// Store the index in the caller's result slot.
	RET

found_qw_align:
	// Use the same algorithm as above. Compress the result into
	// a single doubleword and move it to a GPR for the final
	// calculation.
	VBPERMQ	V6,V10,V6

#ifdef GOARCH_ppc64le
	MFVRD	V6,R3
	ADD	$-1,R3,R11
	ANDN	R3,R11,R11
	POPCNTD	R11,R11
#else
	VSLDOI	$6,V6,V6,V6
	MFVRD	V6,R3
	CNTLZD	R3,R11
#endif
	ADD	R8,R11,R3
	CMPU	R11,R4		// Accept the hit only if its offset is below the remaining length.
	BLT	return
	BR	notfound

done:
	// At this point, R3 has 0xFF in the same position as the byte we are
	// looking for in the doubleword. Use that to calculate the exact index
	// of the byte.
#ifdef GOARCH_ppc64le
	ADD	$-1,R3,R11
	ANDN	R3,R11,R11
	POPCNTD	R11,R11		// Count trailing zeros (Little Endian).
#else
	CNTLZD	R3,R11	// Count leading zeros (Big Endian).
#endif
	CMPU	R8,R7		// Check if we are at the last doubleword.
	SRD	$3,R11		// Convert trailing zeros to bytes.
	ADD	R11,R8,R3
	CMPU	R11,R6,CR7	// If at the last doubleword, check the byte offset.
	BNE	return		// Not the last doubleword: the hit is in range.
	BLE	CR7,return	// Last doubleword: in range only if offset ≤ offset of the final byte.
	BR	notfound

small_string:
	// We unroll this loop for better performance.
	// R7 still holds the address of the final byte and R9 the mask for
	// the bytes that precede s_base in the first doubleword.
	CMPU	R4,$0		// Check for length=0
	BEQ	notfound

	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
	CMPB	R12,R5,R3	// Check for a match.
	AND	R9,R3,R3	// Mask bytes below s_base.
	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation.
	RLDICL	$0,R7,$61,R6	// length-1
	RLDICR	$0,R7,$60,R7	// Last doubleword in R7.
	CMPU	R8,R7
	BNE	CR7,done
	BEQ	notfound	// Hit length.

	MOVDU	8(R8),R12	// Load the next doubleword and advance R8 by 8.
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	CMPU	R8,R7
	BNE	CR6,done
	BEQ	notfound

	MOVDU	8(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	CMPU	R8,R7
	BNE	CR6,done
	BEQ	notfound

	MOVDU	8(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	CMPU	R8,R7
	BNE	CR6,done
	BEQ	notfound

	// Fifth and final doubleword: a string of at most 32 bytes cannot
	// extend past it, so no end-of-string compare is needed before the
	// branch; done performs the range check.
	MOVDU	8(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	BNE	CR6,done
	BR	notfound