// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build ppc64 || ppc64le

#include "go_asm.h"
#include "textflag.h"

// ·IndexByte is the entry stub for searching a byte slice for a single byte.
// It only marshals the arguments into the registers expected by
// indexbytebody<> and tail-branches to it; the result is produced there.
//
// Stack-argument build (GOEXPERIMENT_regabiargs not defined): the slice base,
// slice length and the byte are loaded from the frame, and the address of the
// result slot is placed in R14 so the body can store the answer.
//
// Register-argument build: the only fix-up needed is moving the byte from R6
// into R5. NOTE(review): this presumably reflects base/len/cap arriving in
// R3/R4/R5 with the byte following in R6 - confirm against the Go internal
// ABI specification.
TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
#ifndef GOEXPERIMENT_regabiargs
	MOVD	b_base+0(FP), R3	// R3 = byte array pointer
	MOVD	b_len+8(FP), R4		// R4 = length
	MOVBZ	c+24(FP), R5		// R5 = byte
	MOVD	$ret+32(FP), R14	// R14 = &ret
#else
	MOVD	R6, R5			// R5 = byte (body expects it in R5)
#endif
	BR	indexbytebody<>(SB)

// ·IndexByteString is the entry stub for searching a string for a single byte.
// Like ·IndexByte it only sets up R3/R4/R5 (and R14 in the stack-argument
// build) and branches to indexbytebody<>. In the register-argument build no
// register shuffling is performed before the branch.
TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-32
#ifndef GOEXPERIMENT_regabiargs
	MOVD	s_base+0(FP), R3	// R3 = string
	MOVD	s_len+8(FP), R4		// R4 = length
	MOVBZ	c+16(FP), R5		// R5 = byte
	MOVD	$ret+24(FP), R14	// R14 = &ret
#endif
	BR	indexbytebody<>(SB)

// indexbytebody<> is the shared search routine.
//
// Inputs:
//   R3  = addr of string
//   R4  = len of string
//   R5  = byte to find
//   R14 = addr of return value when not regabi
//
// Output:
//   R3 = index of the first matching byte, or -1 if there is none. When
//   GOEXPERIMENT_regabiargs is not defined the same value is also stored
//   through R14.
//
// Registers written by this routine: R3-R12, R17, CTR, CR0, CR6, CR7, V0-V14.
//
// Outline:
//   - Inputs of at most 32 bytes go to small_string, which scans up to five
//     aligned doublewords with CMPB.
//   - Longer inputs scan one or two aligned doublewords with CMPB until the
//     cursor (R8) is 16-byte aligned, then up to three quadwords with LVX
//     until it is 64-byte aligned, then run a loop that checks 64 bytes per
//     iteration, and finally scan up to four trailing quadwords.
//   - All loads are from naturally aligned addresses, so the first and last
//     loads may cover bytes outside [base, base+len). Matches in those bytes
//     are rejected by the mask in R9 (bytes below base) and by the range
//     checks at done / found_qw_align (bytes at or past base+len).
TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
	MOVD	R3,R17		// Save base address for calculating the index later.
	RLDICR	$0,R3,$60,R8	// Align address to doubleword boundary in R8.
	RLDIMI	$8,R5,$48,R5	// Replicating the byte across the register (step 1 of 3: 2 bytes).
	ADD	R4,R3,R7	// R7 = base + len (one past the last byte; decremented below).
	DCBT	(R8)		// Prepare cache line.

	RLDIMI	$16,R5,$32,R5	// Replication step 2 of 3: low 4 bytes.
	CMPU	R4,$32		// Check if it's a small string (≤32 bytes). Those will be processed differently.
				// The result in CR0 is consumed by the BLE below; none of the
				// instructions in between is a CC (record) form.
	MOVD	$-1,R9		// All-ones; shifted below to form the "bytes below base" mask.
	WORD	$0x54661EB8	// Calculate padding in R6 (rlwinm r6,r3,3,26,28), i.e. R6 = (R3 & 7) * 8 bits.
	RLDIMI	$32,R5,$0,R5	// Replication step 3 of 3: all 8 bytes of R5 hold the byte.
	MOVD	R7,R10		// Save end address (base + len) in R10 for later.
	ADD	$-1,R7,R7	// R7 = address of the last valid byte.
#ifdef GOARCH_ppc64le
	SLD	R6,R9,R9	// Prepare mask for Little Endian
#else
	SRD	R6,R9,R9	// Same for Big Endian
#endif
	BLE	small_string	// Jump to the small string case if it's ≤32 bytes.

	// If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
	// in V0, V1 and V10, then branch to the preloop.
	ANDCC	$63,R3,R11
	BEQ	CR0,qw_align
	RLDICL	$0,R3,$61,R11	// R11 = R3 & 7: offset of base within its doubleword.

	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
	CMPB	R12,R5,R3	// Check for a match.
	AND	R9,R3,R3	// Mask bytes below s_base
	RLDICL	$0,R7,$61,R6	// R6 = (base+len-1) & 7: offset of the last valid byte in its doubleword.
	RLDICR	$0,R7,$60,R7	// Last doubleword in R7
	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation
	BNE	CR7,done
	ADD	$8,R8,R8
	ADD	$-8,R4,R4
	ADD	R4,R11,R4	// R4 = len - 8 + (base & 7): bytes remaining from the new R8.

	// Check for quadword alignment
	ANDCC	$15,R8,R11
	BEQ	CR0,qw_align

	// Not aligned, so handle the next doubleword
	MOVD	0(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR7
	BNE	CR7,done
	ADD	$8,R8,R8
	ADD	$-8,R4,R4

	// Either quadword aligned or 64-byte at this point. We can use LVX.
qw_align:

	// Set up auxiliary data for the vectorized algorithm.
	VSPLTISB	$0,V0		// Replicate 0 across V0
	VSPLTISB	$3,V10		// Use V10 as control for VBPERMQ
	MTVRD	R5,V1
	LVSL	(R0+R0),V11	// NOTE(review): presumably yields bytes 0x00..0x0F (address 0) - confirm.
	VSLB	V11,V10,V10	// Shift each byte of V11 by 3 (the value splatted into V10) to finish the VBPERMQ control.
	VSPLTB	$7,V1,V1	// Replicate byte across V1
	CMPU	R4, $64		// If len ≤ 64, don't use the vectorized loop
	BLE	tail

	// We will load 4 quadwords per iteration in the loop, so check for
	// 64-byte alignment. If 64-byte aligned, then branch to the preloop.
	ANDCC	$63,R8,R11
	BEQ	CR0,preloop

	// Not 64-byte aligned. Load one quadword at a time until aligned.
	// At most three quadwords are needed to reach a 64-byte boundary.
	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
	BNE	CR6,found_qw_align
	ADD	$16,R8,R8
	ADD	$-16,R4,R4

	ANDCC	$63,R8,R11
	BEQ	CR0,preloop
	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
	BNE	CR6,found_qw_align
	ADD	$16,R8,R8
	ADD	$-16,R4,R4

	ANDCC	$63,R8,R11
	BEQ	CR0,preloop
	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
	BNE	CR6,found_qw_align
	ADD	$-16,R4,R4
	ADD	$16,R8,R8

	// 64-byte aligned. Prepare for the main loop.
preloop:
	CMPU	R4,$64
	BLE	tail		// If len ≤ 64, don't use the vectorized loop

	// We are now aligned to a 64-byte boundary. We will load 4 quadwords
	// per loop iteration. R10 holds the end address (base + len), so our
	// loop counter starts at (R10-R8)/64.
	SUB	R8,R10,R6	// R6 = bytes remaining; its low 6 bits become the tail length after the loop.
	SRD	$6,R6,R9	// Loop counter in R9
	MOVD	R9,CTR

	ADD	$-64,R8,R8	// Adjust index for loop entry
	MOVD	$16,R11		// Load offsets for the vector loads
	MOVD	$32,R9
	MOVD	$48,R7

	// Main loop we will load 64 bytes per iteration
loop:
	ADD	$64,R8,R8	// Fuse addi+lvx for performance
	LVX	(R8+R0),V2	// Load 4 16-byte vectors
	LVX	(R8+R11),V3
	VCMPEQUB	V1,V2,V6	// Look for byte in each vector
	VCMPEQUB	V1,V3,V7

	LVX	(R8+R9),V4
	LVX	(R8+R7),V5
	VCMPEQUB	V1,V4,V8
	VCMPEQUB	V1,V5,V9

	VOR	V6,V7,V11	// Compress the result in a single vector
	VOR	V8,V9,V12
	VOR	V11,V12,V13
	VCMPEQUBCC	V0,V13,V14	// Check for byte: compare the combined result against zero.
	BGE	CR6,found	// Taken when V13 is not all zero, i.e. some lane matched.
	BC	16,0,loop	// bdnz loop

	// Handle the trailing bytes or R4 ≤ 64
	RLDICL	$0,R6,$58,R4	// R4 = R6 & 63: bytes left after the 64-byte blocks.
	ADD	$64,R8,R8	// Step past the last block examined by the loop.
tail:
	// Scan up to four quadwords. R4 is the number of valid bytes remaining
	// at R8; found_qw_align rejects a match at or beyond that count.
	CMPU	R4,$0
	BEQ	notfound
	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6
	BNE	CR6,found_qw_align
	ADD	$16,R8,R8
	CMPU	R4,$16,CR6
	BLE	CR6,notfound
	ADD	$-16,R4,R4

	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6
	BNE	CR6,found_qw_align
	ADD	$16,R8,R8
	CMPU	R4,$16,CR6
	BLE	CR6,notfound
	ADD	$-16,R4,R4

	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6
	BNE	CR6,found_qw_align
	ADD	$16,R8,R8
	CMPU	R4,$16,CR6
	BLE	CR6,notfound
	ADD	$-16,R4,R4

	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6
	BNE	CR6,found_qw_align

	// No match anywhere in [base, base+len): return -1.
notfound:
	MOVD	$-1,R3
#ifndef GOEXPERIMENT_regabiargs
	MOVD	R3,(R14)
#endif
	RET

	// Reached from the main loop only: V6-V9 hold the per-byte compare
	// results for the 64 bytes at R8, and at least one of them is non-zero.
found:
	// We will now compress the results into a single doubleword,
	// so it can be moved to a GPR for the final index calculation.

	// The bytes in V6-V9 are either 0x00 or 0xFF. So, permute the
	// first bit of each byte into bits 48-63.
	VBPERMQ	V6,V10,V6
	VBPERMQ	V7,V10,V7
	VBPERMQ	V8,V10,V8
	VBPERMQ	V9,V10,V9

	// Shift each 16-bit component into its correct position for
	// merging into a single doubleword.
#ifdef GOARCH_ppc64le
	VSLDOI	$2,V7,V7,V7
	VSLDOI	$4,V8,V8,V8
	VSLDOI	$6,V9,V9,V9
#else
	VSLDOI	$6,V6,V6,V6
	VSLDOI	$4,V7,V7,V7
	VSLDOI	$2,V8,V8,V8
#endif

	// Merge V6-V9 into a single doubleword and move to a GPR.
	VOR	V6,V7,V11
	VOR	V8,V9,V4
	VOR	V4,V11,V4
	MFVRD	V4,R3

#ifdef GOARCH_ppc64le
	ADD	$-1,R3,R11
	ANDN	R3,R11,R11
	POPCNTD	R11,R11		// Count trailing zeros (Little Endian).
#else
	CNTLZD	R3,R11		// Count leading zeros (Big Endian).
#endif
	ADD	R8,R11,R3	// Calculate byte address

	// R3 holds the address of the matching byte; convert it to an index
	// relative to the base saved in R17 and return it.
return:
	SUB	R17,R3
#ifndef GOEXPERIMENT_regabiargs
	MOVD	R3,(R14)
#endif
	RET

	// Reached after a single-quadword compare: V6 holds the per-byte compare
	// result for the 16 bytes at R8 and R4 the number of valid bytes there.
found_qw_align:
	// Use the same algorithm as above. Compress the result into
	// a single doubleword and move it to a GPR for the final
	// calculation.
	VBPERMQ	V6,V10,V6

#ifdef GOARCH_ppc64le
	MFVRD	V6,R3
	ADD	$-1,R3,R11
	ANDN	R3,R11,R11
	POPCNTD	R11,R11
#else
	VSLDOI	$6,V6,V6,V6
	MFVRD	V6,R3
	CNTLZD	R3,R11
#endif
	ADD	R8,R11,R3	// R3 = address of the first matching byte in this quadword.
	CMPU	R11,R4		// Accept the match only if its offset is below the remaining length.
	BLT	return
	BR	notfound

	// Reached after a doubleword compare. On entry R8 is the address of the
	// doubleword just examined, R7 the address of the doubleword holding the
	// last valid byte, and R6 that byte's offset within it.
done:
	// At this point, R3 has 0xFF in the same position as the byte we are
	// looking for in the doubleword. Use that to calculate the exact index
	// of the byte.
#ifdef GOARCH_ppc64le
	ADD	$-1,R3,R11
	ANDN	R3,R11,R11
	POPCNTD	R11,R11		// Count trailing zeros (Little Endian).
#else
	CNTLZD	R3,R11		// Count leading zeros (Big Endian).
#endif
	CMPU	R8,R7		// Check if we are at the last doubleword.
	SRD	$3,R11		// Convert the bit count to a byte offset.
	ADD	R11,R8,R3
	CMPU	R11,R6,CR7	// If at the last doubleword, check the byte offset.
	BNE	return		// Not the last doubleword: every byte in it is in range.
	BLE	CR7,return	// Last doubleword: in range only if offset ≤ offset of the last valid byte.
	BR	notfound

	// Inputs of at most 32 bytes: at most five aligned doublewords can be
	// touched. R7 is reduced to the doubleword holding the last valid byte,
	// and each step stops with notfound once that doubleword has been checked.
small_string:
	// We unroll this loop for better performance.
	CMPU	R4,$0		// Check for length=0
	BEQ	notfound

	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
	CMPB	R12,R5,R3	// Check for a match.
	AND	R9,R3,R3	// Mask bytes below s_base.
	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation.
	RLDICL	$0,R7,$61,R6	// R6 = (base+len-1) & 7: offset of the last valid byte in its doubleword.
	RLDICR	$0,R7,$60,R7	// Last doubleword in R7.
	CMPU	R8,R7
	BNE	CR7,done
	BEQ	notfound	// Hit length.

	MOVDU	8(R8),R12	// Advance R8 by 8 and load the next doubleword.
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	CMPU	R8,R7
	BNE	CR6,done
	BEQ	notfound

	MOVDU	8(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	CMPU	R8,R7
	BNE	CR6,done
	BEQ	notfound

	MOVDU	8(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	CMPU	R8,R7
	BNE	CR6,done
	BEQ	notfound

	MOVDU	8(R8),R12	// Fifth and final doubleword a ≤32-byte input can span.
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	BNE	CR6,done
	BR	notfound