// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build ppc64 || ppc64le

#include "go_asm.h"
#include "textflag.h"

// func IndexByte(b []byte, c byte) int
// ABIInternal entry: R3 = base pointer, R4 = length, R6 = byte to find.
// Normalizes the byte into R5 and the POWER9 capability flag into R16,
// then tail-calls the shared body.
TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
	// R3 = byte array pointer
	// R4 = length
	MOVD	R6, R5		// R5 = byte
	MOVBZ	internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
	BR	indexbytebody<>(SB)

// func IndexByteString(s string, c byte) int
// ABIInternal entry: R3 = string base, R4 = length, R5 = byte to find.
TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-32
	// R3 = string
	// R4 = length
	// R5 = byte
	MOVBZ	internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
	BR	indexbytebody<>(SB)

// Shared search body for IndexByte and IndexByteString.
// On entry:
// R3 = addr of string
// R4 = len of string
// R5 = byte to find
// R16 = 1 if running on a POWER9 system, 0 otherwise
// On exit:
// R3 = return value (index of first match, or -1 if not found)
TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
	MOVD	R3,R17		// Save base address for calculating the index later.
	RLDICR	$0,R3,$60,R8	// Align address to doubleword boundary in R8.
	RLDIMI	$8,R5,$48,R5	// Start replicating the byte across the register
				// (this and the two RLDIMIs below fill all 8 byte lanes of R5).
	ADD	R4,R3,R7	// Last acceptable address in R7.

	RLDIMI	$16,R5,$32,R5
	CMPU	R4,$32		// Check if it's a small string (≤32 bytes). Those will be processed differently.
	MOVD	$-1,R9		// All-ones mask; trimmed below to hide bytes before the base.
	RLWNM	$3,R3,$26,$28,R6	// shift amount for mask: (r3&0x7)*8
	RLDIMI	$32,R5,$0,R5	// R5 now holds the target byte replicated in every byte lane.
	MOVD	R7,R10		// Save last acceptable address in R10 for later.
	ADD	$-1,R7,R7
#ifdef GOARCH_ppc64le
	SLD	R6,R9,R9	// Prepare mask for Little Endian
#else
	SRD	R6,R9,R9	// Same for Big Endian
#endif
	BLT	small_string	// Jump to the small string case if it's <32 bytes.
	CMP	R16,$1		// optimize for power8 v power9
	BNE	power8
	// POWER9 path: uses LXVB16X (byte-ordered vector loads), so no
	// alignment fixup is needed before the vector loop.
	VSPLTISB	$3,V10	// Use V10 as control for VBPERMQ
	MTVRD	R5,V1
	LVSL	(R0+R0),V11	// set up the permute vector such that V10 has {0x78, .., 0x8, 0x0}
	VSLB	V11,V10,V10	// to extract the first bit of match result into GPR
	VSPLTB	$7,V1,V1	// Replicate byte across V1
	CMP	R4,$64
	MOVD	$16,R11
	MOVD	R3,R8
	BLT	cmp32
	MOVD	$32,R12
	MOVD	$48,R6

loop64:
	LXVB16X	(R0)(R8),V2	// scan 64 bytes at a time
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat0	// match found at R8, jump out

	LXVB16X	(R8)(R11),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat1	// match found at R8+16 bytes, jump out

	LXVB16X	(R8)(R12),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat2	// match found at R8+32 bytes, jump out

	LXVB16X	(R8)(R6),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat3	// match found at R8+48 bytes, jump out
	ADD	$64,R8
	ADD	$-64,R4
	CMP	R4,$64		// >=64 bytes left to scan?
	BGE	loop64
	CMP	R4,$32
	BLT	rem		// jump to rem if there are < 32 bytes left
cmp32:
	LXVB16X	(R0)(R8),V2	// 32-63 bytes left
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat0	// match found at R8

	LXVB16X	(R11)(R8),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat1	// match found at R8+16

	ADD	$32,R8
	ADD	$-32,R4
rem:
	RLDICR	$0,R8,$60,R8	// align address to reuse code for tail end processing
	BR	small_string

	// Fall-through ladder: each label adds 16 to reach the quadword
	// in which the match was detected.
foundat3:
	ADD	$16,R8
foundat2:
	ADD	$16,R8
foundat1:
	ADD	$16,R8
foundat0:
	// Compress the result into a single doubleword and
	// move it to a GPR for the final calculation.
	VBPERMQ	V6,V10,V6
	MFVRD	V6,R3
	// Count leading zeroes up to the match, which ends up in the low 16 bits
	// in both endian modes; compute the index by subtracting 16 from that count.
	CNTLZW	R3,R11
	ADD	$-16,R11
	ADD	R8,R11,R3	// Calculate byte address
	SUB	R17,R3		// Convert address to index relative to the saved base.
	RET

	// POWER8 path: LVX requires 16-byte-aligned loads, so scalar CMPB
	// doublewords are used to reach alignment first.
power8:
	// If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
	// in V0, V1 and V10, then branch to the preloop.
	ANDCC	$63,R3,R11
	BEQ	CR0,qw_align
	RLDICL	$0,R3,$61,R11	// R11 = R3 & 7: bytes the base sits past doubleword alignment.

	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
	CMPB	R12,R5,R3	// Check for a match (0xFF in each matching byte lane).
	AND	R9,R3,R3	// Mask bytes below s_base
	RLDICR	$0,R7,$60,R7	// Last doubleword in R7
	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation
	BNE	CR7,done
	ADD	$8,R8,R8
	ADD	$-8,R4,R4
	ADD	R4,R11,R4	// Adjust remaining length for the bytes skipped before the aligned base.

	// Check for quadword alignment
	ANDCC	$15,R8,R11
	BEQ	CR0,qw_align

	// Not aligned, so handle the next doubleword
	MOVD	0(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR7
	BNE	CR7,done
	ADD	$8,R8,R8
	ADD	$-8,R4,R4

	// Either quadword aligned or 64-byte aligned at this point. We can use LVX.
qw_align:

	// Set up auxiliary data for the vectorized algorithm.
	VSPLTISB	$0,V0	// Replicate 0 across V0
	VSPLTISB	$3,V10	// Use V10 as control for VBPERMQ
	MTVRD	R5,V1
	LVSL	(R0+R0),V11
	VSLB	V11,V10,V10
	VSPLTB	$7,V1,V1	// Replicate byte across V1
	CMPU	R4, $64		// If len ≤ 64, don't use the vectorized loop
	BLE	tail

	// We will load 4 quadwords per iteration in the loop, so check for
	// 64-byte alignment. If 64-byte aligned, then branch to the preloop.
	ANDCC	$63,R8,R11
	BEQ	CR0,preloop

	// Not 64-byte aligned. Load one quadword at a time until aligned
	// (at most three iterations, unrolled below).
	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
	BNE	CR6,found_qw_align
	ADD	$16,R8,R8
	ADD	$-16,R4,R4

	ANDCC	$63,R8,R11
	BEQ	CR0,preloop
	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
	BNE	CR6,found_qw_align
	ADD	$16,R8,R8
	ADD	$-16,R4,R4

	ANDCC	$63,R8,R11
	BEQ	CR0,preloop
	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
	BNE	CR6,found_qw_align
	ADD	$-16,R4,R4
	ADD	$16,R8,R8

	// 64-byte aligned. Prepare for the main loop.
preloop:
	CMPU	R4,$64
	BLE	tail		// If len ≤ 64, don't use the vectorized loop

	// We are now aligned to a 64-byte boundary. We will load 4 quadwords
	// per loop iteration. The last doubleword is in R10, so our loop counter
	// starts at (R10-R8)/64.
	SUB	R8,R10,R6
	SRD	$6,R6,R9	// Loop counter in R9
	MOVD	R9,CTR

	ADD	$-64,R8,R8	// Adjust index for loop entry
	MOVD	$16,R11		// Load offsets for the vector loads
	MOVD	$32,R9
	MOVD	$48,R7

	// Main loop: we will load 64 bytes per iteration
loop:
	ADD	$64,R8,R8	// Fuse addi+lvx for performance
	LVX	(R8+R0),V2	// Load 4 16-byte vectors
	LVX	(R8+R11),V3
	VCMPEQUB	V1,V2,V6	// Look for byte in each vector
	VCMPEQUB	V1,V3,V7

	LVX	(R8+R9),V4
	LVX	(R8+R7),V5
	VCMPEQUB	V1,V4,V8
	VCMPEQUB	V1,V5,V9

	VOR	V6,V7,V11	// Compress the result in a single vector
	VOR	V8,V9,V12
	VOR	V11,V12,V13
	VCMPEQUBCC	V0,V13,V14	// Check for byte
	BGE	CR6,found
	BC	16,0,loop	// bdnz loop

	// Handle the trailing bytes, or R4 ≤ 64
	RLDICL	$0,R6,$58,R4	// R4 = remaining length: low 6 bits of (R10-R8) computed above.
	ADD	$64,R8,R8
tail:
	// Scan up to four remaining quadwords, one at a time.
	CMPU	R4,$0
	BEQ	notfound
	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6
	BNE	CR6,found_qw_align
	ADD	$16,R8,R8
	CMPU	R4,$16,CR6
	BLE	CR6,notfound
	ADD	$-16,R4,R4

	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6
	BNE	CR6,found_qw_align
	ADD	$16,R8,R8
	CMPU	R4,$16,CR6
	BLE	CR6,notfound
	ADD	$-16,R4,R4

	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6
	BNE	CR6,found_qw_align
	ADD	$16,R8,R8
	CMPU	R4,$16,CR6
	BLE	CR6,notfound
	ADD	$-16,R4,R4

	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6
	BNE	CR6,found_qw_align

notfound:
	MOVD	$-1, R3
	RET

found:
	// We will now compress the results into a single doubleword,
	// so it can be moved to a GPR for the final index calculation.

	// The bytes in V6-V9 are either 0x00 or 0xFF. So, permute the
	// first bit of each byte into bits 48-63.
	VBPERMQ	V6,V10,V6
	VBPERMQ	V7,V10,V7
	VBPERMQ	V8,V10,V8
	VBPERMQ	V9,V10,V9

	// Shift each 16-bit component into its correct position for
	// merging into a single doubleword.
#ifdef GOARCH_ppc64le
	VSLDOI	$2,V7,V7,V7
	VSLDOI	$4,V8,V8,V8
	VSLDOI	$6,V9,V9,V9
#else
	VSLDOI	$6,V6,V6,V6
	VSLDOI	$4,V7,V7,V7
	VSLDOI	$2,V8,V8,V8
#endif

	// Merge V6-V9 into a single doubleword and move to a GPR.
	VOR	V6,V7,V11
	VOR	V8,V9,V4
	VOR	V4,V11,V4
	MFVRD	V4,R3

#ifdef GOARCH_ppc64le
	ADD	$-1,R3,R11
	ANDN	R3,R11,R11
	POPCNTD	R11,R11		// Count trailing zeros (Little Endian).
#else
	CNTLZD	R3,R11		// Count leading zeros (Big Endian).
#endif
	ADD	R8,R11,R3	// Calculate byte address

return:
	SUB	R17, R3		// Convert address to index relative to the saved base.
	RET

found_qw_align:
	// Use the same algorithm as above. Compress the result into
	// a single doubleword and move it to a GPR for the final
	// calculation.
	VBPERMQ	V6,V10,V6

#ifdef GOARCH_ppc64le
	MFVRD	V6,R3
	ADD	$-1,R3,R11
	ANDN	R3,R11,R11
	POPCNTD	R11,R11
#else
	VSLDOI	$6,V6,V6,V6
	MFVRD	V6,R3
	CNTLZD	R3,R11
#endif
	ADD	R8,R11,R3
	CMPU	R11,R4		// Match may lie past the end of the string:
	BLT	return		// only valid if its offset is within the remaining length.
	BR	notfound
	PCALIGN	$16

done:
	// Scalar (CMPB) match epilogue.
	ADD	$-1,R10,R6
	// Offset of last valid byte within the final
	// doubleword, for the last-doubleword comparison.
	RLDICL	$0,R6,$61,R6
	// At this point, R3 has 0xFF in the same position as the byte we are
	// looking for in the doubleword. Use that to calculate the exact index
	// of the byte.
#ifdef GOARCH_ppc64le
	ADD	$-1,R3,R11
	ANDN	R3,R11,R11
	POPCNTD	R11,R11		// Count trailing zeros (Little Endian).
#else
	CNTLZD	R3,R11		// Count leading zeros (Big Endian).
#endif
	CMPU	R8,R7		// Check if we are at the last doubleword.
	SRD	$3,R11		// Convert trailing zeros to bytes.
	ADD	R11,R8,R3
	CMPU	R11,R6,CR7	// If at the last doubleword, check the byte offset.
	BNE	return		// Not the last doubleword: any match is valid.
	BLE	CR7,return	// Last doubleword: valid only if the offset is in range.
	BR	notfound

small_string:
	// Process a string of length < 32 bytes, one doubleword at a time.
	// We unroll this loop for better performance.
	CMPU	R4,$0		// Check for length=0
	BEQ	notfound

	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
	CMPB	R12,R5,R3	// Check for a match.
	AND	R9,R3,R3	// Mask bytes below s_base.
	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation.
	RLDICR	$0,R7,$60,R7	// Last doubleword in R7.
	CMPU	R8,R7
	BNE	CR7,done
	BEQ	notfound	// Hit length.

	MOVDU	8(R8),R12	// Pre-incrementing load: advances R8 by 8, then loads.
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	CMPU	R8,R7
	BNE	CR6,done
	BEQ	notfound

	MOVDU	8(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	CMPU	R8,R7
	BNE	CR6,done
	BEQ	notfound

	MOVDU	8(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	CMPU	R8,R7
	BNE	CR6,done
	BEQ	notfound

	MOVDU	8(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	BNE	CR6,done
	BR	notfound