github.com/primecitizens/pcz/std@v0.2.1/core/bytealg/indexbyte_ppc64x.s

// SPDX-License-Identifier: Apache-2.0
// Copyright 2023 The Prime Citizens
//
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build pcz && (ppc64 || ppc64le)

#include "textflag.h"

TEXT ·IndexSliceByte<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
	// R3 = byte array pointer
	// R4 = length
	MOVD	R6, R5		// R5 = byte
	MOVBZ	·isPOWER9(SB), R16
	BR	indexbytebody<>(SB)

TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-32
	// R3 = string
	// R4 = length
	// R5 = byte
	MOVBZ	·isPOWER9(SB), R16
	BR	indexbytebody<>(SB)

// R3 = addr of string
// R4 = len of string
// R5 = byte to find
// R16 = 1 if running on a POWER9 system, 0 otherwise
// On exit:
// R3 = return value
TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
	MOVD	R3,R17		// Save base address for calculating the index later.
	RLDICR	$0,R3,$60,R8	// Align address to doubleword boundary in R8.
	RLDIMI	$8,R5,$48,R5	// Replicating the byte across the register.
	ADD	R4,R3,R7	// Last acceptable address in R7.

	RLDIMI	$16,R5,$32,R5
	CMPU	R4,$32		// Check if it's a small string (≤32 bytes). Those will be processed differently.
	MOVD	$-1,R9
	RLWNM	$3,R3,$26,$28,R6	// shift amount for mask (r3&0x7)*8
	RLDIMI	$32,R5,$0,R5
	MOVD	R7,R10		// Save last acceptable address in R10 for later.
	ADD	$-1,R7,R7
#ifdef GOARCH_ppc64le
	SLD	R6,R9,R9	// Prepare mask for Little Endian
#else
	SRD	R6,R9,R9	// Same for Big Endian
#endif
	BLT	small_string	// Jump to the small string case if it's <32 bytes.
	CMP	R16,$1		// optimize for power8 v power9
	BNE	power8
	VSPLTISB	$3,V10	// Use V10 as control for VBPERMQ
	MTVRD	R5,V1
	LVSL	(R0+R0),V11	// set up the permute vector such that V10 has {0x78, .., 0x8, 0x0}
	VSLB	V11,V10,V10	// to extract the first bit of match result into GPR
	VSPLTB	$7,V1,V1	// Replicate byte across V1
	CMP	R4,$64
	MOVD	$16,R11
	MOVD	R3,R8
	BLT	cmp32
	MOVD	$32,R12
	MOVD	$48,R6

loop64:
	LXVB16X	(R0)(R8),V2	// scan 64 bytes at a time
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat0	// match found at R8, jump out

	LXVB16X	(R8)(R11),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat1	// match found at R8+16 bytes, jump out

	LXVB16X	(R8)(R12),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat2	// match found at R8+32 bytes, jump out

	LXVB16X	(R8)(R6),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat3	// match found at R8+48 bytes, jump out
	ADD	$64,R8
	ADD	$-64,R4
	CMP	R4,$64		// >=64 bytes left to scan?
	BGE	loop64
	CMP	R4,$32
	BLT	rem		// jump to rem if there are < 32 bytes left
cmp32:
	LXVB16X	(R0)(R8),V2	// 32-63 bytes left
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat0	// match found at R8

	LXVB16X	(R11)(R8),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat1	// match found at R8+16

	ADD	$32,R8
	ADD	$-32,R4
rem:
	RLDICR	$0,R8,$60,R8	// align address to reuse code for tail end processing
	BR	small_string

foundat3:
	ADD	$16,R8
foundat2:
	ADD	$16,R8
foundat1:
	ADD	$16,R8
foundat0:
	// Compress the result into a single doubleword and
	// move it to a GPR for the final calculation.
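	// Note: V10 was set up above as {0x78, .., 0x8, 0x0}, so VBPERMQ
	// gathers one bit per byte of the 0x00/0xFF compare result into
	// bits 48-63 of V6. After MFVRD, the 16-bit match mask sits in the
	// low half of the low 32-bit word, so CNTLZW yields 16 plus the
	// number of non-matching bytes before the first match; subtracting
	// 16 leaves the byte offset within the 16-byte block at R8.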
	VBPERMQ	V6,V10,V6
	MFVRD	V6,R3
	// Count leading zeroes up to the match, which ends up in the low 16 bits
	// in both endian modes; compute the index by subtracting 16.
	CNTLZW	R3,R11
	ADD	$-16,R11
	ADD	R8,R11,R3	// Calculate byte address
	SUB	R17,R3
	RET
power8:
	// If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
	// in V0, V1 and V10, then branch to the preloop.
	ANDCC	$63,R3,R11
	BEQ	CR0,qw_align
	RLDICL	$0,R3,$61,R11

	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
	CMPB	R12,R5,R3	// Check for a match.
	AND	R9,R3,R3	// Mask bytes below s_base
	RLDICR	$0,R7,$60,R7	// Last doubleword in R7
	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation
	BNE	CR7,done
	ADD	$8,R8,R8
	ADD	$-8,R4,R4
	ADD	R4,R11,R4

	// Check for quadword alignment
	ANDCC	$15,R8,R11
	BEQ	CR0,qw_align

	// Not aligned, so handle the next doubleword
	MOVD	0(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR7
	BNE	CR7,done
	ADD	$8,R8,R8
	ADD	$-8,R4,R4

	// Either quadword or 64-byte aligned at this point. We can use LVX.
qw_align:

	// Set up auxiliary data for the vectorized algorithm.
	VSPLTISB	$0,V0	// Replicate 0 across V0
	VSPLTISB	$3,V10	// Use V10 as control for VBPERMQ
	MTVRD	R5,V1
	LVSL	(R0+R0),V11
	VSLB	V11,V10,V10
	VSPLTB	$7,V1,V1	// Replicate byte across V1
	CMPU	R4, $64		// If len ≤ 64, don't use the vectorized loop
	BLE	tail

	// We will load 4 quadwords per iteration in the loop, so check for
	// 64-byte alignment. If 64-byte aligned, then branch to the preloop.
	ANDCC	$63,R8,R11
	BEQ	CR0,preloop

	// Not 64-byte aligned. Load one quadword at a time until aligned.
	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
	BNE	CR6,found_qw_align
	ADD	$16,R8,R8
	ADD	$-16,R4,R4

	ANDCC	$63,R8,R11
	BEQ	CR0,preloop
	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
	BNE	CR6,found_qw_align
	ADD	$16,R8,R8
	ADD	$-16,R4,R4

	ANDCC	$63,R8,R11
	BEQ	CR0,preloop
	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
	BNE	CR6,found_qw_align
	ADD	$-16,R4,R4
	ADD	$16,R8,R8

	// 64-byte aligned. Prepare for the main loop.
preloop:
	CMPU	R4,$64
	BLE	tail		// If len ≤ 64, don't use the vectorized loop

	// We are now aligned to a 64-byte boundary. We will load 4 quadwords
	// per loop iteration. The last doubleword is in R10, so our loop counter
	// starts at (R10-R8)/64.
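	// R10 still holds the one-past-the-end address saved earlier, and R8
	// is 64-byte aligned here, so (R10-R8)>>6 counts the full 64-byte
	// blocks left to scan; the remainder (R6 mod 64) is recovered after
	// the loop via RLDICL and handled by the tail code.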
	SUB	R8,R10,R6
	SRD	$6,R6,R9	// Loop counter in R9
	MOVD	R9,CTR

	ADD	$-64,R8,R8	// Adjust index for loop entry
	MOVD	$16,R11		// Load offsets for the vector loads
	MOVD	$32,R9
	MOVD	$48,R7

	// Main loop: we will load 64 bytes per iteration
loop:
	ADD	$64,R8,R8	// Fuse addi+lvx for performance
	LVX	(R8+R0),V2	// Load 4 16-byte vectors
	LVX	(R8+R11),V3
	VCMPEQUB	V1,V2,V6	// Look for byte in each vector
	VCMPEQUB	V1,V3,V7

	LVX	(R8+R9),V4
	LVX	(R8+R7),V5
	VCMPEQUB	V1,V4,V8
	VCMPEQUB	V1,V5,V9

	VOR	V6,V7,V11	// Compress the result in a single vector
	VOR	V8,V9,V12
	VOR	V11,V12,V13
	VCMPEQUBCC	V0,V13,V14	// Check for byte
	BGE	CR6,found
	BC	16,0,loop	// bdnz loop

	// Handle the trailing bytes or R4 ≤ 64
	RLDICL	$0,R6,$58,R4
	ADD	$64,R8,R8
tail:
	CMPU	R4,$0
	BEQ	notfound
	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6
	BNE	CR6,found_qw_align
	ADD	$16,R8,R8
	CMPU	R4,$16,CR6
	BLE	CR6,notfound
	ADD	$-16,R4,R4

	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6
	BNE	CR6,found_qw_align
	ADD	$16,R8,R8
	CMPU	R4,$16,CR6
	BLE	CR6,notfound
	ADD	$-16,R4,R4

	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6
	BNE	CR6,found_qw_align
	ADD	$16,R8,R8
	CMPU	R4,$16,CR6
	BLE	CR6,notfound
	ADD	$-16,R4,R4

	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6
	BNE	CR6,found_qw_align

notfound:
	MOVD	$-1, R3
	RET

found:
	// We will now compress the results into a single doubleword,
	// so it can be moved to a GPR for the final index calculation.

	// The bytes in V6-V9 are either 0x00 or 0xFF. So, permute the
	// first bit of each byte into bits 48-63.
	VBPERMQ	V6,V10,V6
	VBPERMQ	V7,V10,V7
	VBPERMQ	V8,V10,V8
	VBPERMQ	V9,V10,V9

	// Shift each 16-bit component into its correct position for
	// merging into a single doubleword.
#ifdef GOARCH_ppc64le
	VSLDOI	$2,V7,V7,V7
	VSLDOI	$4,V8,V8,V8
	VSLDOI	$6,V9,V9,V9
#else
	VSLDOI	$6,V6,V6,V6
	VSLDOI	$4,V7,V7,V7
	VSLDOI	$2,V8,V8,V8
#endif

	// Merge V6-V9 into a single doubleword and move to a GPR.
	VOR	V6,V7,V11
	VOR	V8,V9,V4
	VOR	V4,V11,V4
	MFVRD	V4,R3

#ifdef GOARCH_ppc64le
	ADD	$-1,R3,R11
	ANDN	R3,R11,R11
	POPCNTD	R11,R11		// Count trailing zeros (Little Endian).
#else
	CNTLZD	R3,R11		// Count leading zeros (Big Endian).
#endif
	ADD	R8,R11,R3	// Calculate byte address

return:
	SUB	R17, R3
	RET

found_qw_align:
	// Use the same algorithm as above. Compress the result into
	// a single doubleword and move it to a GPR for the final
	// calculation.
	VBPERMQ	V6,V10,V6

#ifdef GOARCH_ppc64le
	MFVRD	V6,R3
	ADD	$-1,R3,R11
	ANDN	R3,R11,R11
	POPCNTD	R11,R11
#else
	VSLDOI	$6,V6,V6,V6
	MFVRD	V6,R3
	CNTLZD	R3,R11
#endif
	ADD	R8,R11,R3
	CMPU	R11,R4
	BLT	return
	BR	notfound
	PCALIGN	$16

done:
	ADD	$-1,R10,R6
	// Offset of last index for the final
	// doubleword comparison
	RLDICL	$0,R6,$61,R6
	// At this point, R3 has 0xFF in the same position as the byte we are
	// looking for in the doubleword. Use that to calculate the exact index
	// of the byte.
#ifdef GOARCH_ppc64le
	ADD	$-1,R3,R11
	ANDN	R3,R11,R11
	POPCNTD	R11,R11		// Count trailing zeros (Little Endian).
#else
	CNTLZD	R3,R11		// Count leading zeros (Big Endian).
#endif
	CMPU	R8,R7		// Check if we are at the last doubleword.
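	// The zero count in R11 is a bit position, and CMPB sets a whole
	// byte per match, so R11>>3 is the byte offset of the match inside
	// the doubleword at R8.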
	SRD	$3,R11		// Convert the zero count to a byte offset.
	ADD	R11,R8,R3
	CMPU	R11,R6,CR7	// If at the last doubleword, check the byte offset.
	BNE	return
	BLE	CR7,return
	BR	notfound

small_string:
	// Process a string of length < 32 bytes.
	// We unroll this loop for better performance.
	CMPU	R4,$0		// Check for length=0
	BEQ	notfound

	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
	CMPB	R12,R5,R3	// Check for a match.
	AND	R9,R3,R3	// Mask bytes below s_base.
	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation.
	RLDICR	$0,R7,$60,R7	// Last doubleword in R7.
	CMPU	R8,R7
	BNE	CR7,done
	BEQ	notfound	// Hit length.

	MOVDU	8(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	CMPU	R8,R7
	BNE	CR6,done
	BEQ	notfound

	MOVDU	8(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	CMPU	R8,R7
	BNE	CR6,done
	BEQ	notfound

	MOVDU	8(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	CMPU	R8,R7
	BNE	CR6,done
	BEQ	notfound

	MOVDU	8(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	BNE	CR6,done
	BR	notfound
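// For reference, the semantics implemented above amount to the following
// minimal Go sketch (an illustrative, hypothetical helper, not part of
// this package's API):
//
//	// indexByte returns the index of the first occurrence of c in s,
//	// or -1 if c is not present.
//	func indexByte(s []byte, c byte) int {
//		for i, b := range s {
//			if b == c {
//				return i
//			}
//		}
//		return -1
//	}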