github.com/bir3/gocompiler@v0.9.2202/src/internal/bytealg/indexbyte_ppc64x.s

// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build ppc64 || ppc64le

#include "go_asm.h"
#include "textflag.h"

TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
	// R3 = byte array pointer
	// R4 = length
	MOVD	R6, R5		// R5 = byte
	BR	indexbytebody<>(SB)

TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-32
	// R3 = string
	// R4 = length
	// R5 = byte
	BR	indexbytebody<>(SB)

#ifndef GOPPC64_power9
#ifdef GOARCH_ppc64le
DATA indexbytevbperm<>+0(SB)/8, $0x3830282018100800
DATA indexbytevbperm<>+8(SB)/8, $0x7870686058504840
#else
DATA indexbytevbperm<>+0(SB)/8, $0x0008101820283038
DATA indexbytevbperm<>+8(SB)/8, $0x4048505860687078
#endif
GLOBL indexbytevbperm<>+0(SB), RODATA, $16
#endif

// Some operations are endian specific; choose the correct opcode based on GOARCH.
// Note, _VCZBEBB is only available on power9 and newer.
#ifdef GOARCH_ppc64le
#define _LDBEX	MOVDBR
#define _LWBEX	MOVWBR
#define _LHBEX	MOVHBR
#define _VCZBEBB VCTZLSBB
#else
#define _LDBEX	MOVD
#define _LWBEX	MOVW
#define _LHBEX	MOVH
#define _VCZBEBB VCLZLSBB
#endif

// R3 = addr of string
// R4 = len of string
// R5 = byte to find
// On exit:
// R3 = return value
TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
	CMPU	R4,$32

#ifndef GOPPC64_power9
	// Load VBPERMQ constant to reduce compare into an ordered bit mask.
	MOVD	$indexbytevbperm<>+00(SB),R16
	LXVD2X	(R16),V0	// V0 = VBPERMQ bit selection mask
#endif

	MTVRD	R5,V1
	VSPLTB	$7,V1,V1	// Replicate byte across V1

	BLT	cmp16		// Jump to the small string case if it's <32 bytes.

	CMP	R4,$64,CR1
	MOVD	$16,R11
	MOVD	R3,R8
	BLT	CR1,cmp32	// Special case for length 32 - 63
	MOVD	$32,R12
	MOVD	$48,R6

	RLDICR	$0,R4,$63-6,R9	// R9 = len &^ 63
	ADD	R3,R9,R9	// R9 = &s[len &^ 63]
	ANDCC	$63,R4		// (len &= 63) cmp 0.

	PCALIGN	$16
loop64:
	LXVD2X	(R0)(R8),V2	// Scan 64 bytes at a time, starting at &s[0]
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat0	// Match found at R8, jump out

	LXVD2X	(R11)(R8),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat1	// Match found at R8+16 bytes, jump out

	LXVD2X	(R12)(R8),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat2	// Match found at R8+32 bytes, jump out

	LXVD2X	(R6)(R8),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat3	// Match found at R8+48 bytes, jump out

	ADD	$64,R8
	CMPU	R8,R9,CR1
	BNE	CR1,loop64	// R8 != &s[len &^ 63]?

	PCALIGN	$32
	BEQ	notfound	// Is tail length 0? CR0 is set before entering loop64.

	CMP	R4,$32		// Tail length >= 32, use cmp32 path.
	CMP	R4,$16,CR1
	BGE	cmp32

	ADD	R8,R4,R9
	ADD	$-16,R9
	BLE	CR1,cmp64_tail_gt0

cmp64_tail_gt16:	// Tail length 17 - 32
	LXVD2X	(R0)(R8),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat0

cmp64_tail_gt0:		// Tail length 1 - 16
	MOVD	R9,R8
	LXVD2X	(R0)(R9),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat0

	BR	notfound

cmp32:	// Length 32 - 63

	// Bytes 0 - 15
	LXVD2X	(R0)(R8),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat0

	// Bytes 16 - 31
	LXVD2X	(R8)(R11),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat1	// Match found at R8+16 bytes, jump out

	BEQ	notfound	// Is length <= 32? (CR0 holds this comparison on entry to cmp32)
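	// Lengths 33 - 63 continue below. At most two more 16B loads are needed:
	// one at R8+32, and a final load ending at the last byte of the span
	// (R9 = R8+R4-16), which may overlap bytes that were already checked.
	// When the length is <= 48, the ISEL below picks the final-load address
	// directly, so that case needs only one more compare.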
	CMP	R4,$48

	ADD	R4,R8,R9	// Compute &s[len(s)-16]
	ADD	$32,R8,R8
	ADD	$-16,R9,R9
	ISEL	CR0GT,R8,R9,R8	// R8 = len(s) <= 48 ? R9 : R8

	// Bytes 32 - 47
	LXVD2X	(R0)(R8),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat0	// Match found at R8, jump out

	BLE	notfound

	// Bytes 48 - 63
	MOVD	R9,R8		// R9 holds the final check.
	LXVD2X	(R0)(R9),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat0	// Match found at R8, jump out

	BR	notfound

// If ISA 3.0 instructions are unavailable, we need to account for the extra 16 added by CNTLZW.
#ifndef GOPPC64_power9
#define ADJUST_FOR_CNTLZW -16
#else
#define ADJUST_FOR_CNTLZW 0
#endif

// Now, find the index of the 16B vector the match was discovered in. If CNTLZW is used
// to determine the offset into the 16B vector, it will overcount by 16. Account for it here.
foundat3:
	SUB	R3,R8,R3
	ADD	$48+ADJUST_FOR_CNTLZW,R3
	BR	vfound
foundat2:
	SUB	R3,R8,R3
	ADD	$32+ADJUST_FOR_CNTLZW,R3
	BR	vfound
foundat1:
	SUB	R3,R8,R3
	ADD	$16+ADJUST_FOR_CNTLZW,R3
	BR	vfound
foundat0:
	SUB	R3,R8,R3
	ADD	$0+ADJUST_FOR_CNTLZW,R3
vfound:
	// Map equal values into a 16-bit value with earlier matches setting higher bits.
#ifndef GOPPC64_power9
	VBPERMQ	V6,V0,V6
	MFVRD	V6,R4
	CNTLZW	R4,R4
#else
#ifdef GOARCH_ppc64le
	// Put the value back into LE ordering by swapping doublewords.
	XXPERMDI	V6,V6,$2,V6
#endif
	_VCZBEBB	V6,R4
#endif
	ADD	R3,R4,R3
	RET

cmp16:	// Length 16 - 31
	CMPU	R4,$16
	ADD	R4,R3,R9
	BLT	cmp8

	ADD	$-16,R9,R9	// &s[len(s)-16]

	// Bytes 0 - 15
	LXVD2X	(R0)(R3),V2
	VCMPEQUBCC	V2,V1,V6
	MOVD	R3,R8
	BNE	CR6,foundat0	// Match found at R8, jump out

	BEQ	notfound

	// Bytes 16 - 30
	MOVD	R9,R8		// R9 holds the final check.
	LXVD2X	(R0)(R9),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat0	// Match found at R8, jump out

	BR	notfound

cmp8:	// Length 8 - 15
#ifdef GOPPC64_power10
	// Load all the bytes into a single VSR in BE order.
	SLD	$56,R4,R5
	LXVLL	R3,R5,V2
	// Compare and count the number of leading bytes which don't match.
	VCMPEQUB	V2,V1,V6
	VCLZLSBB	V6,R3
	// If the count is the number of bytes or more, no match was found.
	CMPU	R3,R4
	MOVD	$-1,R5
	// Otherwise, the count is the index of the first match.
	ISEL	CR0LT,R3,R5,R3
	RET
#else
	RLDIMI	$8,R5,$48,R5	// Replicating the byte across the register.
	RLDIMI	$16,R5,$32,R5
	RLDIMI	$32,R5,$0,R5
	CMPU	R4,$8
	BLT	cmp4
	MOVD	$-8,R11
	ADD	$-8,R4,R4

	_LDBEX	(R0)(R3),R10
	_LDBEX	(R11)(R9),R11
	CMPB	R10,R5,R10
	CMPB	R11,R5,R11
	CMPU	R10,$0
	CMPU	R11,$0,CR1
	CNTLZD	R10,R10
	CNTLZD	R11,R11
	SRD	$3,R10,R3
	SRD	$3,R11,R11
	BNE	found

	ADD	R4,R11,R4
	MOVD	$-1,R3
	ISEL	CR1EQ,R3,R4,R3
	RET

cmp4:	// Length 4 - 7
	CMPU	R4,$4
	BLT	cmp2
	MOVD	$-4,R11
	ADD	$-4,R4,R4

	_LWBEX	(R0)(R3),R10
	_LWBEX	(R11)(R9),R11
	CMPB	R10,R5,R10
	CMPB	R11,R5,R11
	CNTLZW	R10,R10
	CNTLZW	R11,R11
	CMPU	R10,$32
	CMPU	R11,$32,CR1
	SRD	$3,R10,R3
	SRD	$3,R11,R11
	BNE	found

	ADD	R4,R11,R4
	MOVD	$-1,R3
	ISEL	CR1EQ,R3,R4,R3
	RET

cmp2:	// Length 2 - 3
	CMPU	R4,$2
	BLT	cmp1

	_LHBEX	(R0)(R3),R10
	CMPB	R10,R5,R10
	SLDCC	$48,R10,R10
	CNTLZD	R10,R10
	SRD	$3,R10,R3
	BNE	found

cmp1:	// Length 1
	MOVD	$-1,R3
	ANDCC	$1,R4,R31
	BEQ	found

	MOVBZ	-1(R9),R10
	CMPB	R10,R5,R10
	ANDCC	$1,R10
	ADD	$-1,R4
	ISEL	CR0EQ,R3,R4,R3

found:
	RET
#endif

notfound:
	MOVD	$-1,R3
	RET
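// For reference, every path above implements the same contract as the exported
// ·IndexByte and ·IndexByteString entry points: return the index of the first
// occurrence of the byte, or -1 if it is absent. A minimal Go sketch of that
// behavior (illustrative only, not the code the runtime actually runs):
//
//	func IndexByte(b []byte, c byte) int {
//		for i := 0; i < len(b); i++ {
//			if b[i] == c {
//				return i
//			}
//		}
//		return -1
//	}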