// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build ppc64 ppc64le

#include "go_asm.h"
#include "textflag.h"

// func IndexByte(b []byte, c byte) int
// All four entry points below only marshal arguments into the shared
// register contract and branch to indexbytebody<>:
//   R3  = base pointer, R4 = length, R5 = byte to find, R14 = &ret.
TEXT ·IndexByte(SB),NOSPLIT|NOFRAME,$0-40
	MOVD	b_base+0(FP), R3	// R3 = byte array pointer
	MOVD	b_len+8(FP), R4		// R4 = length
	MOVBZ	c+24(FP), R5		// R5 = byte
	MOVD	$ret+32(FP), R14	// R14 = &ret
	BR	indexbytebody<>(SB)

// func IndexByteString(s string, c byte) int
TEXT ·IndexByteString(SB),NOSPLIT|NOFRAME,$0-32
	MOVD	s_base+0(FP), R3	// R3 = string
	MOVD	s_len+8(FP), R4		// R4 = length
	MOVBZ	c+16(FP), R5		// R5 = byte
	MOVD	$ret+24(FP), R14	// R14 = &ret
	BR	indexbytebody<>(SB)

// Linkname-style alias used by package bytes; shares ·IndexByte's stack map.
TEXT bytes·IndexByte(SB),NOSPLIT|NOFRAME,$0-40
	FUNCDATA $0, ·IndexByte·args_stackmap(SB)
	MOVD	b_base+0(FP), R3	// R3 = byte array pointer
	MOVD	b_len+8(FP), R4		// R4 = length
	MOVBZ	c+24(FP), R5		// R5 = byte
	MOVD	$ret+32(FP), R14	// R14 = &ret
	BR	indexbytebody<>(SB)

// Linkname-style alias used by package strings; shares ·IndexByteString's stack map.
TEXT strings·IndexByte(SB),NOSPLIT|NOFRAME,$0-32
	FUNCDATA $0, ·IndexByteString·args_stackmap(SB)
	MOVD	s_base+0(FP), R3	// R3 = string
	MOVD	s_len+8(FP), R4		// R4 = length
	MOVBZ	c+16(FP), R5		// R5 = byte
	MOVD	$ret+24(FP), R14	// R14 = &ret
	BR	indexbytebody<>(SB)

// indexbytebody is the shared search body.
// In:  R3 = base pointer, R4 = length, R5 = byte to find, R14 = &result.
// Out: index of first match (or -1) is stored through R14; nothing is
//      returned in registers. Clobbers R3-R12, R17, CTR, CR0/CR6/CR7,
//      and V0-V14 on the vector paths.
// Strategy: align to a doubleword and scan with CMPB; once quadword/64-byte
// aligned, use AltiVec (LVX + VCMPEQUB) to scan 64 bytes per iteration;
// strings of <=32 bytes take a fully scalar unrolled path (small_string).
TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
	MOVD	R3,R17		// Save base address for calculating the index later.
	RLDICR	$0,R3,$60,R8	// Align address to doubleword boundary in R8.
	RLDIMI	$8,R5,$48,R5	// Replicating the byte across the register.
	ADD	R4,R3,R7	// Last acceptable address in R7.
	DCBT	(R8)		// Prepare cache line.

	RLDIMI	$16,R5,$32,R5
	CMPU	R4,$32		// Check if it's a small string (<=32 bytes). Those will be processed differently.
	MOVD	$-1,R9
	// Raw encoding: the assembler of this era lacks this rlwinm form.
	WORD	$0x54661EB8	// Calculate padding in R6 (rlwinm r6,r3,3,26,28).
	RLDIMI	$32,R5,$0,R5	// R5 now has the target byte replicated in all 8 byte lanes.
	MOVD	R7,R10		// Save last acceptable address in R10 for later.
	ADD	$-1,R7,R7
#ifdef GOARCH_ppc64le
	SLD	R6,R9,R9	// Prepare mask for Little Endian
#else
	SRD	R6,R9,R9	// Same for Big Endian
#endif
	BLE	small_string	// Jump to the small string case if it's <=32 bytes.

	// If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
	// in V0, V1 and V10, then branch to the preloop.
	ANDCC	$63,R3,R11
	BEQ	CR0,qw_align
	RLDICL	$0,R3,$61,R11	// R11 = offset of base within its doubleword.

	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
	CMPB	R12,R5,R3	// Check for a match: R3 gets 0xFF in every matching byte lane.
	AND	R9,R3,R3	// Mask bytes below s_base
	RLDICL	$0,R7,$61,R6	// length-1
	RLDICR	$0,R7,$60,R7	// Last doubleword in R7
	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation
	BNE	CR7,done
	ADD	$8,R8,R8
	ADD	$-8,R4,R4
	ADD	R4,R11,R4	// Re-add the leading bytes we skipped so R4 counts from R8.

	// Check for quadword alignment
	ANDCC	$15,R8,R11
	BEQ	CR0,qw_align

	// Not aligned, so handle the next doubleword
	MOVD	0(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR7
	BNE	CR7,done
	ADD	$8,R8,R8
	ADD	$-8,R4,R4

	// Either quadword or 64-byte aligned at this point. We can use LVX.
qw_align:

	// Set up auxiliary data for the vectorized algorithm.
	VSPLTISB	$0,V0	// Replicate 0 across V0
	VSPLTISB	$3,V10	// Use V10 as control for VBPERMQ
	MTVRD	R5,V1
	LVSL	(R0+R0),V11	// V11 = 0,1,2,...,15 (permute identity, used to build VBPERMQ control).
	VSLB	V11,V10,V10
	VSPLTB	$7,V1,V1	// Replicate byte across V1
	CMPU	R4, $64		// If len <= 64, don't use the vectorized loop
	BLE	tail

	// We will load 4 quadwords per iteration in the loop, so check for
	// 64-byte alignment. If 64-byte aligned, then branch to the preloop.
	ANDCC	$63,R8,R11
	BEQ	CR0,preloop

	// Not 64-byte aligned. Load one quadword at a time until aligned.
	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
	BNE	CR6,found_qw_align
	ADD	$16,R8,R8
	ADD	$-16,R4,R4

	ANDCC	$63,R8,R11
	BEQ	CR0,preloop
	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
	BNE	CR6,found_qw_align
	ADD	$16,R8,R8
	ADD	$-16,R4,R4

	ANDCC	$63,R8,R11
	BEQ	CR0,preloop
	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
	BNE	CR6,found_qw_align
	ADD	$-16,R4,R4
	ADD	$16,R8,R8

	// 64-byte aligned. Prepare for the main loop.
preloop:
	CMPU	R4,$64
	BLE	tail		// If len <= 64, don't use the vectorized loop

	// We are now aligned to a 64-byte boundary. We will load 4 quadwords
	// per loop iteration. The last doubleword is in R10, so our loop counter
	// starts at (R10-R8)/64.
	SUB	R8,R10,R6
	SRD	$6,R6,R9	// Loop counter in R9
	MOVD	R9,CTR

	ADD	$-64,R8,R8	// Adjust index for loop entry
	MOVD	$16,R11		// Load offsets for the vector loads
	MOVD	$32,R9
	MOVD	$48,R7

	// Main loop: we will load 64 bytes per iteration
loop:
	ADD	$64,R8,R8	// Fuse addi+lvx for performance
	LVX	(R8+R0),V2	// Load 4 16-byte vectors
	LVX	(R8+R11),V3
	VCMPEQUB	V1,V2,V6	// Look for byte in each vector
	VCMPEQUB	V1,V3,V7

	LVX	(R8+R9),V4
	LVX	(R8+R7),V5
	VCMPEQUB	V1,V4,V8
	VCMPEQUB	V1,V5,V9

	VOR	V6,V7,V11	// Compress the result in a single vector
	VOR	V8,V9,V12
	VOR	V11,V12,V13
	VCMPEQUBCC	V0,V13,V14	// Check for byte (any nonzero lane means a match somewhere)
	BGE	CR6,found
	BC	16,0,loop	// bdnz loop: decrement CTR, branch while nonzero

	// Handle the trailing bytes or R4 <= 64
	RLDICL	$0,R6,$58,R4	// R4 = remaining length mod 64.
	ADD	$64,R8,R8
tail:
	CMPU	R4,$0
	BEQ	notfound
	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6
	BNE	CR6,found_qw_align
	ADD	$16,R8,R8
	CMPU	R4,$16,CR6
	BLE	CR6,notfound
	ADD	$-16,R4,R4

	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6
	BNE	CR6,found_qw_align
	ADD	$16,R8,R8
	CMPU	R4,$16,CR6
	BLE	CR6,notfound
	ADD	$-16,R4,R4

	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6
	BNE	CR6,found_qw_align
	ADD	$16,R8,R8
	CMPU	R4,$16,CR6
	BLE	CR6,notfound
	ADD	$-16,R4,R4

	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6
	BNE	CR6,found_qw_align

notfound:
	MOVD	$-1,R3		// No match: store -1 through &ret.
	MOVD	R3,(R14)
	RET

found:
	// We will now compress the results into a single doubleword,
	// so it can be moved to a GPR for the final index calculation.

	// The bytes in V6-V9 are either 0x00 or 0xFF. So, permute the
	// first bit of each byte into bits 48-63.
	VBPERMQ	V6,V10,V6
	VBPERMQ	V7,V10,V7
	VBPERMQ	V8,V10,V8
	VBPERMQ	V9,V10,V9

	// Shift each 16-bit component into its correct position for
	// merging into a single doubleword.
#ifdef GOARCH_ppc64le
	VSLDOI	$2,V7,V7,V7
	VSLDOI	$4,V8,V8,V8
	VSLDOI	$6,V9,V9,V9
#else
	VSLDOI	$6,V6,V6,V6
	VSLDOI	$4,V7,V7,V7
	VSLDOI	$2,V8,V8,V8
#endif

	// Merge V6-V9 into a single doubleword and move to a GPR.
	VOR	V6,V7,V11
	VOR	V8,V9,V4
	VOR	V4,V11,V4
	MFVRD	V4,R3

#ifdef GOARCH_ppc64le
	ADD	$-1,R3,R11
	ANDN	R3,R11,R11	// Isolate lowest set bit: R11 = (R3-1) &^ R3.
	POPCNTD	R11,R11		// Count trailing zeros (Little Endian).
#else
	CNTLZD	R3,R11		// Count leading zeros (Big Endian).
#endif
	ADD	R8,R11,R3	// Calculate byte address

return:
	SUB	R17,R3		// Index = match address - saved base address.
	MOVD	R3,(R14)
	RET

found_qw_align:
	// Use the same algorithm as above. Compress the result into
	// a single doubleword and move it to a GPR for the final
	// calculation.
	VBPERMQ	V6,V10,V6

#ifdef GOARCH_ppc64le
	MFVRD	V6,R3
	ADD	$-1,R3,R11
	ANDN	R3,R11,R11
	POPCNTD	R11,R11
#else
	VSLDOI	$6,V6,V6,V6
	MFVRD	V6,R3
	CNTLZD	R3,R11
#endif
	ADD	R8,R11,R3
	CMPU	R11,R4		// Reject matches in padding bytes past the end.
	BLT	return
	BR	notfound

done:
	// At this point, R3 has 0xFF in the same position as the byte we are
	// looking for in the doubleword. Use that to calculate the exact index
	// of the byte.
#ifdef GOARCH_ppc64le
	ADD	$-1,R3,R11
	ANDN	R3,R11,R11
	POPCNTD	R11,R11		// Count trailing zeros (Little Endian).
#else
	CNTLZD	R3,R11		// Count leading zeros (Big Endian).
#endif
	CMPU	R8,R7		// Check if we are at the last doubleword.
	SRD	$3,R11		// Convert trailing zeros to bytes.
	ADD	R11,R8,R3
	CMPU	R11,R6,CR7	// If at the last doubleword, check the byte offset.
	BNE	return
	BLE	CR7,return
	BR	notfound

small_string:
	// Scalar path for strings of <=32 bytes, unrolled for better performance.
	CMPU	R4,$0		// Check for length=0
	BEQ	notfound

	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
	CMPB	R12,R5,R3	// Check for a match.
	AND	R9,R3,R3	// Mask bytes below s_base.
	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation.
	RLDICL	$0,R7,$61,R6	// length-1
	RLDICR	$0,R7,$60,R7	// Last doubleword in R7.
	CMPU	R8,R7
	BNE	CR7,done
	BEQ	notfound	// Hit length.

	MOVDU	8(R8),R12	// Pre-increment load: advances R8 by 8 and loads.
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	CMPU	R8,R7
	BNE	CR6,done
	BEQ	notfound

	MOVDU	8(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	CMPU	R8,R7
	BNE	CR6,done
	BEQ	notfound

	MOVDU	8(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	CMPU	R8,R7
	BNE	CR6,done
	BEQ	notfound

	MOVDU	8(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	BNE	CR6,done
	BR	notfound