// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build ppc64 || ppc64le
// +build ppc64 ppc64le

#include "go_asm.h"
#include "textflag.h"

// ·IndexByte loads its arguments from the frame and tail-branches to
// indexbytebody<>. Judging by the frame offsets used below (base at +0,
// length at +8, byte at +24, result at +32) the Go prototype is presumably
//	func IndexByte(b []byte, c byte) int
// (declared outside this file -- confirm against the Go declaration).
TEXT ·IndexByte(SB),NOSPLIT|NOFRAME,$0-40
	MOVD	b_base+0(FP), R3	// R3 = byte array pointer
	MOVD	b_len+8(FP), R4		// R4 = length
	MOVBZ	c+24(FP), R5		// R5 = byte (zero-extended)
	MOVD	$ret+32(FP), R14	// R14 = &ret
	BR	indexbytebody<>(SB)

// ·IndexByteString is the string flavour of the entry point above: base at
// +0, length at +8, byte at +16, result at +24. The Go prototype is
// presumably
//	func IndexByteString(s string, c byte) int
// (declared outside this file -- confirm against the Go declaration).
TEXT ·IndexByteString(SB),NOSPLIT|NOFRAME,$0-32
	MOVD	s_base+0(FP), R3	// R3 = string
	MOVD	s_len+8(FP), R4		// R4 = length
	MOVBZ	c+16(FP), R5		// R5 = byte (zero-extended)
	MOVD	$ret+24(FP), R14	// R14 = &ret
	BR	indexbytebody<>(SB)

// indexbytebody<> is the shared search routine.
//
// Inputs (set up by the two entry points above):
//	R3  = address of the first byte to search
//	R4  = length in bytes
//	R5  = byte to look for (low 8 bits)
//	R14 = address where the result is stored
//
// Output: the index of the first occurrence of the byte, or -1 if it is
// not present, is stored through R14; the routine then returns.
//
// Registers written: R3-R12, R17, CTR, CR0, CR6, CR7, V0-V14.
//
// Strategy: lengths <= 32 are scanned one aligned doubleword at a time with
// CMPB (small_string). Longer inputs first walk up to quadword / 64-byte
// alignment, then scan 64 bytes per iteration with LVX/VCMPEQUB, and finish
// the remainder one quadword at a time (tail). Loads are always made from
// aligned addresses, so bytes before the start are masked off and matches
// past the end are rejected by explicit bounds checks.
TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
	MOVD	R3,R17		// Save base address for calculating the index later.
	RLDICR	$0,R3,$60,R8	// Align address to doubleword boundary in R8.
	RLDIMI	$8,R5,$48,R5	// Replicating the byte across the register.
	ADD	R4,R3,R7	// End address (base+len) in R7.
	DCBT	(R8)		// Prepare cache line.

	RLDIMI	$16,R5,$32,R5
	CMPU	R4,$32		// Check if it's a small string (≤32 bytes). Those will be processed differently.
	MOVD	$-1,R9
	WORD	$0x54661EB8	// Calculate padding in R6 (rlwinm r6,r3,3,26,28), i.e. R6 = (R3 & 7) * 8 bits.
	RLDIMI	$32,R5,$0,R5
	MOVD	R7,R10		// Save end address (base+len) in R10 for later.
	ADD	$-1,R7,R7	// R7 = address of the last valid byte.
#ifdef GOARCH_ppc64le
	SLD	R6,R9,R9	// Prepare mask for Little Endian
#else
	SRD	R6,R9,R9	// Same for Big Endian
#endif
	BLE	small_string	// Jump to the small string case if it's ≤32 bytes.

	// If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
	// in V0, V1 and V10, then branch to the preloop.
	ANDCC	$63,R3,R11
	BEQ	CR0,qw_align
	RLDICL	$0,R3,$61,R11	// R11 = R3 & 7, the byte offset of the start within its doubleword.

	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
	CMPB	R12,R5,R3	// Check for a match.
	AND	R9,R3,R3	// Mask bytes below s_base
	RLDICL	$0,R7,$61,R6	// length-1
	RLDICR	$0,R7,$60,R7	// Last doubleword in R7
	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation
	BNE	CR7,done
	ADD	$8,R8,R8
	ADD	$-8,R4,R4
	ADD	R4,R11,R4	// R4 = bytes remaining from the new R8 (adds back the skipped leading bytes).

	// Check for quadword alignment
	ANDCC	$15,R8,R11
	BEQ	CR0,qw_align

	// Not aligned, so handle the next doubleword
	MOVD	0(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR7
	BNE	CR7,done
	ADD	$8,R8,R8
	ADD	$-8,R4,R4

	// Either quadword aligned or 64-byte at this point. We can use LVX.
qw_align:

	// Set up auxiliary data for the vectorized algorithm.
	VSPLTISB	$0,V0		// Replicate 0 across V0
	VSPLTISB	$3,V10		// Use V10 as control for VBPERMQ
	MTVRD	R5,V1
	LVSL	(R0+R0),V11
	VSLB	V11,V10,V10
	VSPLTB	$7,V1,V1	// Replicate byte across V1
	CMPU	R4, $64		// If len ≤ 64, don't use the vectorized loop
	BLE	tail

	// We will load 4 quadwords per iteration in the loop, so check for
	// 64-byte alignment. If 64-byte aligned, then branch to the preloop.
	ANDCC	$63,R8,R11
	BEQ	CR0,preloop

	// Not 64-byte aligned. Load one quadword at a time until aligned.
	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
	BNE	CR6,found_qw_align
	ADD	$16,R8,R8
	ADD	$-16,R4,R4

	ANDCC	$63,R8,R11
	BEQ	CR0,preloop
	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
	BNE	CR6,found_qw_align
	ADD	$16,R8,R8
	ADD	$-16,R4,R4

	ANDCC	$63,R8,R11
	BEQ	CR0,preloop
	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
	BNE	CR6,found_qw_align
	ADD	$-16,R4,R4
	ADD	$16,R8,R8

	// 64-byte aligned. Prepare for the main loop.
preloop:
	CMPU	R4,$64
	BLE	tail		// If len ≤ 64, don't use the vectorized loop

	// We are now aligned to a 64-byte boundary. We will load 4 quadwords
	// per loop iteration. The end address (base+len) is in R10, so our
	// loop counter starts at (R10-R8)/64.
	SUB	R8,R10,R6
	SRD	$6,R6,R9	// Loop counter in R9
	MOVD	R9,CTR

	ADD	$-64,R8,R8	// Adjust index for loop entry
	MOVD	$16,R11		// Load offsets for the vector loads
	MOVD	$32,R9
	MOVD	$48,R7

	// Main loop we will load 64 bytes per iteration
loop:
	ADD	$64,R8,R8	// Fuse addi+lvx for performance
	LVX	(R8+R0),V2	// Load 4 16-byte vectors
	LVX	(R8+R11),V3
	VCMPEQUB	V1,V2,V6	// Look for byte in each vector
	VCMPEQUB	V1,V3,V7

	LVX	(R8+R9),V4
	LVX	(R8+R7),V5
	VCMPEQUB	V1,V4,V8
	VCMPEQUB	V1,V5,V9

	VOR	V6,V7,V11	// Compress the result in a single vector
	VOR	V8,V9,V12
	VOR	V11,V12,V13
	VCMPEQUBCC	V0,V13,V14	// Check for byte
	BGE	CR6,found
	BC	16,0,loop	// bdnz loop

	// Handle the trailing bytes or R4 ≤ 64
	RLDICL	$0,R6,$58,R4	// R4 = R6 & 63: bytes left over after the 64-byte iterations.
	ADD	$64,R8,R8
tail:
	// Scan up to four more quadwords. R4 is the number of valid bytes
	// remaining at R8; found_qw_align rejects matches at or beyond R4.
	CMPU	R4,$0
	BEQ	notfound
	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6
	BNE	CR6,found_qw_align
	ADD	$16,R8,R8
	CMPU	R4,$16,CR6
	BLE	CR6,notfound
	ADD	$-16,R4,R4

	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6
	BNE	CR6,found_qw_align
	ADD	$16,R8,R8
	CMPU	R4,$16,CR6
	BLE	CR6,notfound
	ADD	$-16,R4,R4

	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6
	BNE	CR6,found_qw_align
	ADD	$16,R8,R8
	CMPU	R4,$16,CR6
	BLE	CR6,notfound
	ADD	$-16,R4,R4

	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6
	BNE	CR6,found_qw_align

notfound:
	MOVD	$-1,R3		// Result is -1 when the byte is absent.
	MOVD	R3,(R14)
	RET

found:
	// We will now compress the results into a single doubleword,
	// so it can be moved to a GPR for the final index calculation.

	// The bytes in V6-V9 are either 0x00 or 0xFF. So, permute the
	// first bit of each byte into bits 48-63.
	VBPERMQ	V6,V10,V6
	VBPERMQ	V7,V10,V7
	VBPERMQ	V8,V10,V8
	VBPERMQ	V9,V10,V9

	// Shift each 16-bit component into its correct position for
	// merging into a single doubleword.
#ifdef GOARCH_ppc64le
	VSLDOI	$2,V7,V7,V7
	VSLDOI	$4,V8,V8,V8
	VSLDOI	$6,V9,V9,V9
#else
	VSLDOI	$6,V6,V6,V6
	VSLDOI	$4,V7,V7,V7
	VSLDOI	$2,V8,V8,V8
#endif

	// Merge V6-V9 into a single doubleword and move to a GPR.
	VOR	V6,V7,V11
	VOR	V8,V9,V4
	VOR	V4,V11,V4
	MFVRD	V4,R3

#ifdef GOARCH_ppc64le
	ADD	$-1,R3,R11
	ANDN	R3,R11,R11
	POPCNTD	R11,R11		// Count trailing zeros (Little Endian).
#else
	CNTLZD	R3,R11		// Count leading zeros (Big Endian).
#endif
	ADD	R8,R11,R3	// Calculate byte address

return:
	SUB	R17,R3		// Index = match address - saved base address.
	MOVD	R3,(R14)
	RET

found_qw_align:
	// Use the same algorithm as above. Compress the result into
	// a single doubleword and move it to a GPR for the final
	// calculation.
	VBPERMQ	V6,V10,V6

#ifdef GOARCH_ppc64le
	MFVRD	V6,R3
	ADD	$-1,R3,R11
	ANDN	R3,R11,R11
	POPCNTD	R11,R11
#else
	VSLDOI	$6,V6,V6,V6
	MFVRD	V6,R3
	CNTLZD	R3,R11
#endif
	ADD	R8,R11,R3
	CMPU	R11,R4		// Accept the match only if its offset is below the remaining length.
	BLT	return
	BR	notfound

done:
	// At this point, R3 has 0xFF in the same position as the byte we are
	// looking for in the doubleword. Use that to calculate the exact index
	// of the byte.
#ifdef GOARCH_ppc64le
	ADD	$-1,R3,R11
	ANDN	R3,R11,R11
	POPCNTD	R11,R11		// Count trailing zeros (Little Endian).
#else
	CNTLZD	R3,R11		// Count leading zeros (Big Endian).
#endif
	CMPU	R8,R7		// Check if we are at the last doubleword.
	SRD	$3,R11		// Convert the zero-bit count to a byte offset.
	ADD	R11,R8,R3
	CMPU	R11,R6,CR7	// If at the last doubleword, check the byte offset.
	BNE	return		// Not the last doubleword: the match is in range.
	BLE	CR7,return	// Last doubleword: in range if offset ≤ (length-1) offset in R6.
	BR	notfound

small_string:
	// We unroll this loop for better performance.
	CMPU	R4,$0		// Check for length=0
	BEQ	notfound

	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
	CMPB	R12,R5,R3	// Check for a match.
	AND	R9,R3,R3	// Mask bytes below s_base.
	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation.
	RLDICL	$0,R7,$61,R6	// length-1
	RLDICR	$0,R7,$60,R7	// Last doubleword in R7.
	CMPU	R8,R7
	BNE	CR7,done
	BEQ	notfound	// Hit length.

	MOVDU	8(R8),R12	// Advance R8 by 8 and load the next doubleword.
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	CMPU	R8,R7
	BNE	CR6,done
	BEQ	notfound

	MOVDU	8(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	CMPU	R8,R7
	BNE	CR6,done
	BEQ	notfound

	MOVDU	8(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	CMPU	R8,R7
	BNE	CR6,done
	BEQ	notfound

	MOVDU	8(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	BNE	CR6,done
	BR	notfound