// SPDX-License-Identifier: Apache-2.0
// Copyright 2023 The Prime Citizens
//
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build pcz && (ppc64 || ppc64le)

#include "textflag.h"

// ·Bytes compares two byte slices and returns -1, 0 or +1 in R3.
// (Presumably declared in Go as func Bytes(a, b []byte) int; the
// declaration is not in this file.)
//
// If both slices start at the same address only the lengths are
// compared. Otherwise control is transferred (tail branch) to
// cmpbodyp9<> when the byte at ·isPOWER9 equals 1, or to cmpbody<>
// when it does not. ·isPOWER9 is defined outside this file.
TEXT ·Bytes<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
	// incoming:
	// R3 a addr -> R5
	// R4 a len  -> R3
	// R5 a cap  unused
	// R6 b addr -> R6
	// R7 b len  -> R4
	// R8 b cap  unused
	MOVD	R3, R5
	MOVD	R4, R3
	MOVD	R7, R4
	CMP	R5,R6,CR7	// CR7: a addr vs b addr
	CMP	R3,R4,CR6	// CR6: len(a) vs len(b)
	BEQ	CR7,equal	// same base address: lengths decide
	MOVBZ	·isPOWER9(SB), R16
	CMP	R16,$1
	BNE	power8
	BR	cmpbodyp9<>(SB)
power8:
	BR	cmpbody<>(SB)
equal:
	BEQ	CR6,done	// same address and same length
	MOVD	$1, R8
	BGT	CR6,greater	// len(a) > len(b): return +1
	NEG	R8		// len(a) < len(b): return -1
greater:
	MOVD	R8, R3
	RET
done:
	MOVD	$0, R3
	RET

// ·String compares two strings and returns -1, 0 or +1 in R3.
// (Presumably declared in Go as func String(a, b string) int; the
// declaration is not in this file.)
//
// Apart from the incoming register layout it is identical to ·Bytes:
// same-address inputs are decided by length alone, everything else is
// handed to cmpbodyp9<> or cmpbody<> depending on ·isPOWER9.
TEXT ·String<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
	// incoming:
	// R3 a addr -> R5
	// R4 a len  -> R3
	// R5 b addr -> R6
	// R6 b len  -> R4
	MOVD	R6, R7		// stash b len; R6 is overwritten next
	MOVD	R5, R6
	MOVD	R3, R5
	MOVD	R4, R3
	MOVD	R7, R4
	CMP	R5,R6,CR7	// CR7: a addr vs b addr
	CMP	R3,R4,CR6	// CR6: len(a) vs len(b)
	BEQ	CR7,equal	// same base address: lengths decide
	MOVBZ	·isPOWER9(SB), R16
	CMP	R16,$1
	BNE	power8
	BR	cmpbodyp9<>(SB)
power8:
	BR	cmpbody<>(SB)
equal:
	BEQ	CR6,done	// same address and same length
	MOVD	$1, R8
	BGT	CR6,greater	// len(a) > len(b): return +1
	NEG	R8		// len(a) < len(b): return -1
greater:
	MOVD	R8, R3
	RET

done:
	MOVD	$0, R3
	RET

#ifdef GOARCH_ppc64le
// byteswap<> is the 16-byte VPERM control vector loaded into SWAP by
// the `different` path of cmpbody<> on little-endian targets. It is
// applied to V3/V4 before their doublewords are moved to GPRs and
// compared.
DATA byteswap<>+0(SB)/8, $0x0706050403020100
DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908
GLOBL byteswap<>+0(SB), RODATA, $16
#define SWAP V21
#endif

// Do an efficient memcmp for ppc64le/ppc64/POWER8
// R3 = a len
// R4 = b len
// R5 = a addr
// R6 = b addr
// On exit:
// R3 = return value (-1, 0 or +1)
//
// Internal register use:
// R8  = min(len(A), len(B)), the number of bytes to compare
// R9  = bytes still to compare / chunk counts
// R10, R11, R12 = vector load offsets 16, 32, 48 (R10/R11 are later
//       reused as data scratch)
// CR2 = len(A) vs len(B), consulted once all compared bytes matched
// R0 is used as a zero index register throughout.
TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
	MOVD	R3,R8		// set up length
	CMP	R3,R4,CR2	// unequal?
	BLT	CR2,setuplen	// len(A) < len(B): keep len(A) as compare length
	MOVD	R4,R8		// use R4 for comparison len
setuplen:
	CMP	R8,$32		// optimize >= 32
	MOVD	R8,R9
	BLT	setup8a		// optimize < 32
	MOVD	$16,R10		// set offsets to load into vectors
	CMP	R8,$64
	BLT	cmp32		// process size 32-63

	DCBT	(R5)		// optimize >= 64
	DCBT	(R6)		// cache hint
	MOVD	$32,R11		// set offsets to load into vector
	MOVD	$48,R12		// set offsets to load into vector

loop64a:// process size 64 and greater
	LXVD2X	(R5)(R0),V3	// load bytes of A at offset 0 into vector
	LXVD2X	(R6)(R0),V4	// load bytes of B at offset 0 into vector
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different	// jump out if its different

	LXVD2X	(R5)(R10),V3	// load bytes of A at offset 16 into vector
	LXVD2X	(R6)(R10),V4	// load bytes of B at offset 16 into vector

	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different

	LXVD2X	(R5)(R11),V3	// load bytes of A at offset 32 into vector
	LXVD2X	(R6)(R11),V4	// load bytes of B at offset 32 into vector

	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different

	LXVD2X	(R5)(R12),V3	// load bytes of A at offset 48 into vector
	LXVD2X	(R6)(R12),V4	// load bytes of B at offset 48 into vector

	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different

	ADD	$-64,R9,R9	// reduce remaining size by 64
	ADD	$64,R5,R5	// increment to next 64 bytes of A
	ADD	$64,R6,R6	// increment to next 64 bytes of B
	CMPU	R9,$64
	BGE	loop64a		// loop back to loop64a only if there are >= 64 bytes remaining

	CMPU	R9,$32
	BGE	cmp32		// go to cmp32 if there are 32-63 bytes remaining
	CMPU	R9,$0
	BNE	rem		// go to rem if the remainder is not 0

	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
	BLT	CR2,less	// jump to less if len(A)<len(B)
	BR	greater		// jump to greater otherwise
cmp32:
	LXVD2X	(R5)(R0),V3	// load bytes of A at offset 0 into vector
	LXVD2X	(R6)(R0),V4	// load bytes of B at offset 0 into vector

	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different

	LXVD2X	(R5)(R10),V3	// load bytes of A at offset 16 into vector
	LXVD2X	(R6)(R10),V4	// load bytes of B at offset 16 into vector

	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different

	ADD	$-32,R9,R9	// reduce remaining size by 32
	ADD	$32,R5,R5	// increment to next 32 bytes of A
	ADD	$32,R6,R6	// increment to next 32 bytes of B
	CMPU	R9,$0
	BNE	rem		// go to rem if the remainder is not 0
	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
	BLT	CR2,less	// jump to less if len(A)<len(B)
	BR	greater		// jump to greater otherwise
rem:
	MOVD	R9,R8		// R8 = remaining byte count (< 32 here)
	ANDCC	$24,R8,R9	// Any 8 byte chunks?
	BEQ	leftover	// and result is 0
	BR	setup8a

different:
	// V3 and V4 hold the 16-byte chunks of A and B that did not match.
#ifdef GOARCH_ppc64le
	MOVD	$byteswap<>+00(SB), R16
	LXVD2X	(R16)(R0),SWAP	// Set up swap string

	VPERM	V3,V3,SWAP,V3
	VPERM	V4,V4,SWAP,V4
#endif
	MFVSRD	VS35,R16	// move upper doublewords of A and B into GPR for comparison
	MFVSRD	VS36,R10	// (VS35/VS36 are the VSX names of V3/V4)

	CMPU	R16,R10
	BEQ	lower		// upper halves match: the difference is in the lower halves
	BGT	greater
	MOVD	$-1,R3		// return value if A < B
	RET
lower:
	VSLDOI	$8,V3,V3,V3	// move lower doublewords of A and B into GPR for comparison
	MFVSRD	VS35,R16
	VSLDOI	$8,V4,V4,V4
	MFVSRD	VS36,R10

	CMPU	R16,R10
	BGT	greater
	MOVD	$-1,R3		// return value if A < B
	RET
setup8a:
	SRADCC	$3,R8,R9	// get the 8 byte count
	BEQ	leftover	// shifted value is 0
	CMPU	R8,$8		// optimize 8byte move
	BEQ	size8
	CMPU	R8,$16
	BEQ	size16
	MOVD	R9,CTR		// loop count for doublewords
loop8:
#ifdef GOARCH_ppc64le
	MOVDBR	(R5+R0),R16	// doublewords to compare
	MOVDBR	(R6+R0),R10	// LE compare order
#else
	MOVD	(R5+R0),R16	// doublewords to compare
	MOVD	(R6+R0),R10	// BE compare order
#endif
	ADD	$8,R5
	ADD	$8,R6
	CMPU	R16,R10		// match?
	BC	8,2,loop8	// decrement CTR; loop while CTR != 0 and the doublewords were equal
	BGT	greater
	BLT	less
leftover:
	ANDCC	$7,R8,R9	// check for leftover bytes
	BEQ	zeroremainder
simplecheck:
	MOVD	R0,R14		// R14 = running offset into A and B, starts at 0
	CMP	R9,$4		// process 4 bytes
	BLT	halfword
#ifdef GOARCH_ppc64le
	MOVWBR	(R5)(R14),R10
	MOVWBR	(R6)(R14),R11
#else
	MOVWZ	(R5)(R14),R10
	MOVWZ	(R6)(R14),R11
#endif
	CMPU	R10,R11
	BGT	greater
	BLT	less
	ADD	$-4,R9
	ADD	$4,R14
	PCALIGN	$16

halfword:
	CMP	R9,$2		// process 2 bytes
	BLT	byte
#ifdef GOARCH_ppc64le
	MOVHBR	(R5)(R14),R10
	MOVHBR	(R6)(R14),R11
#else
	MOVHZ	(R5)(R14),R10
	MOVHZ	(R6)(R14),R11
#endif
	CMPU	R10,R11
	BGT	greater
	BLT	less
	ADD	$-2,R9
	ADD	$2,R14
	PCALIGN	$16
byte:
	CMP	R9,$0		// process 1 byte
	BEQ	skip
	MOVBZ	(R5)(R14),R10
	MOVBZ	(R6)(R14),R11
	CMPU	R10,R11
	BGT	greater
	BLT	less
	PCALIGN	$16
skip:
	// All compared bytes matched: the lengths decide the result.
	BEQ	CR2,equal
	BGT	CR2,greater

less:
	MOVD	$-1,R3		// return value if A < B
	RET
size16:
	LXVD2X	(R5)(R0),V3	// load bytes of A at offset 0 into vector
	LXVD2X	(R6)(R0),V4	// load bytes of B at offset 0 into vector
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different
zeroremainder:
	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
	BLT	CR2,less	// jump to less if len(A)<len(B)
	BR	greater		// jump to greater otherwise
size8:
#ifdef GOARCH_ppc64le
	MOVDBR	(R5+R0),R16	// doublewords to compare
	MOVDBR	(R6+R0),R10	// LE compare order
#else
	MOVD	(R5+R0),R16	// doublewords to compare
	MOVD	(R6+R0),R10	// BE compare order
#endif
	CMPU	R16,R10		// match?
	BGT	greater
	BLT	less
	BGT	CR2,greater	// bytes equal, len(A) > len(B)
	BLT	CR2,less	// bytes equal, len(A) < len(B)
equal:
	MOVD	$0, R3		// return value if A == B
	RET
greater:
	MOVD	$1,R3		// return value if A > B
	RET

// Do an efficient memcmp for ppc64le/ppc64/POWER9
// R3 = a len
// R4 = b len
// R5 = a addr
// R6 = b addr
// On exit:
// R3 = return value (-1, 0 or +1)
//
// Internal register use:
// R8  = min(len(A), len(B)), the number of bytes to compare
// R9  = bytes still to compare
// R10, R11, R12 = vector load offsets 16, 32, 48 (R10/R11 are later
//       reused as data scratch, R10 also as the -16 tail offset)
// CR2 = len(A) vs len(B), consulted once all compared bytes matched
// R0 is used as a zero index register throughout.
// Unlike cmpbody<>, the `different` path here performs no byte swap
// on the vectors loaded with LXVB16X.
TEXT cmpbodyp9<>(SB),NOSPLIT|NOFRAME,$0-0
	MOVD	R3,R8		// set up length
	CMP	R3,R4,CR2	// unequal?
	BLT	CR2,setuplen	// len(A) < len(B): keep len(A) as compare length
	MOVD	R4,R8		// use R4 for comparison len
setuplen:
	CMP	R8,$16		// optimize for size<16
	MOVD	R8,R9
	BLT	simplecheck
	MOVD	$16,R10		// set offsets to load into vectors
	CMP	R8,$32		// optimize for size 16-31
	BLT	cmp16
	CMP	R8,$64
	BLT	cmp32		// optimize for size 32-63
	DCBT	(R5)		// optimize for size>=64
	DCBT	(R6)		// cache hint

	MOVD	$32,R11		// set offsets to load into vector
	MOVD	$48,R12		// set offsets to load into vector

loop64a:// process size 64 and greater
	LXVB16X	(R0)(R5),V3	// load bytes of A at offset 0 into vector
	LXVB16X	(R0)(R6),V4	// load bytes of B at offset 0 into vector
	VCMPNEBCC	V3,V4,V1	// record comparison into V1
	BNE	CR6,different	// jump out if its different

	LXVB16X	(R10)(R5),V3	// load bytes of A at offset 16 into vector
	LXVB16X	(R10)(R6),V4	// load bytes of B at offset 16 into vector
	VCMPNEBCC	V3,V4,V1
	BNE	CR6,different

	LXVB16X	(R11)(R5),V3	// load bytes of A at offset 32 into vector
	LXVB16X	(R11)(R6),V4	// load bytes of B at offset 32 into vector
	VCMPNEBCC	V3,V4,V1
	BNE	CR6,different

	LXVB16X	(R12)(R5),V3	// load bytes of A at offset 48 into vector
	LXVB16X	(R12)(R6),V4	// load bytes of B at offset 48 into vector
	VCMPNEBCC	V3,V4,V1
	BNE	CR6,different

	ADD	$-64,R9,R9	// reduce remaining size by 64
	ADD	$64,R5,R5	// increment to next 64 bytes of A
	ADD	$64,R6,R6	// increment to next 64 bytes of B
	CMPU	R9,$64
	BGE	loop64a		// loop back to loop64a only if there are >= 64 bytes remaining

	CMPU	R9,$32
	BGE	cmp32		// go to cmp32 if there are 32-63 bytes remaining
	CMPU	R9,$16
	BGE	cmp16		// go to cmp16 if there are 16-31 bytes left
	CMPU	R9,$0
	BNE	simplecheck	// go to simplecheck for remaining bytes

	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
	BLT	CR2,less	// jump to less if len(A)<len(B)
	BR	greater		// jump to greater otherwise
cmp32:
	LXVB16X	(R0)(R5),V3	// load bytes of A at offset 0 into vector
	LXVB16X	(R0)(R6),V4	// load bytes of B at offset 0 into vector

	VCMPNEBCC	V3,V4,V1	// record comparison into V1
	BNE	CR6,different	// jump out if its different

	LXVB16X	(R10)(R5),V3	// load bytes of A at offset 16 into vector
	LXVB16X	(R10)(R6),V4	// load bytes of B at offset 16 into vector
	VCMPNEBCC	V3,V4,V1
	BNE	CR6,different

	ADD	$-32,R9,R9	// reduce remaining size by 32
	ADD	$32,R5,R5	// increment to next 32 bytes of A
	ADD	$32,R6,R6	// increment to next 32 bytes of B
	CMPU	R9,$16		// go to cmp16 if there are 16-31 bytes left
	BGE	cmp16
	CMPU	R9,$0
	BNE	simplecheck	// go to simplecheck for remainder bytes
	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
	BLT	CR2,less	// jump to less if len(A)<len(B)
	BR	greater		// jump to greater otherwise
different:
	// V3 and V4 hold the 16-byte chunks of A and B that did not match.

	MFVSRD	VS35,R16	// move upper doublewords of A and B into GPR for comparison
	MFVSRD	VS36,R10	// (VS35/VS36 are the VSX names of V3/V4)

	CMPU	R16,R10
	BEQ	lower		// upper halves match: the difference is in the lower halves
	BGT	greater
	MOVD	$-1,R3		// return value if A < B
	RET
lower:
	MFVSRLD	VS35,R16	// next move lower doublewords of A and B into GPR for comparison
	MFVSRLD	VS36,R10

	CMPU	R16,R10
	BGT	greater
	MOVD	$-1,R3		// return value if A < B
	RET

greater:
	MOVD	$1,R3		// return value if A > B
	RET
cmp16:
	// Every branch to cmp16 in this function is taken with R9 >= 16.
	ANDCC	$16,R9,R31	// test bit 4 of the remaining count; R31 only receives the AND result
	BEQ	tail

	LXVB16X	(R0)(R5),V3	// load bytes of A at offset 0 into vector
	LXVB16X	(R0)(R6),V4	// load bytes of B at offset 0 into vector
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different

	ADD	$16,R5
	ADD	$16,R6
tail:
	ANDCC	$15,R9		// R9 = bytes left after the 16-byte chunk above
	BEQ	end

	// Compare the final 16 bytes of the range with one load that
	// overlaps bytes already compared: advance to the end of the data
	// and load from 16 bytes before it.
	ADD	R9,R5
	ADD	R9,R6
	MOVD	$-16,R10

	LXVB16X	(R10)(R5),V3	// load the last 16 bytes of A into vector
	LXVB16X	(R10)(R6),V4	// load the last 16 bytes of B into vector
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different
end:
	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
	BLT	CR2,less	// jump to less if len(A)<len(B)
	BR	greater		// jump to greater otherwise
simplecheck:
	MOVD	$0,R14		// R14 = running offset into A and B
	CMP	R9,$8		// process 8 bytes
	BLT	word
#ifdef GOARCH_ppc64le
	MOVDBR	(R5+R14),R10
	MOVDBR	(R6+R14),R11
#else
	MOVD	(R5+R14),R10
	MOVD	(R6+R14),R11
#endif
	CMPU	R10,R11
	BGT	greater
	BLT	less
	ADD	$8,R14
	ADD	$-8,R9
	PCALIGN	$16
word:
	CMP	R9,$4		// process 4 bytes
	BLT	halfword
#ifdef GOARCH_ppc64le
	MOVWBR	(R5+R14),R10
	MOVWBR	(R6+R14),R11
#else
	MOVWZ	(R5+R14),R10
	MOVWZ	(R6+R14),R11
#endif
	CMPU	R10,R11
	BGT	greater
	BLT	less
	ADD	$4,R14
	ADD	$-4,R9
	PCALIGN	$16
halfword:
	CMP	R9,$2		// process 2 bytes
	BLT	byte
#ifdef GOARCH_ppc64le
	MOVHBR	(R5+R14),R10
	MOVHBR	(R6+R14),R11
#else
	MOVHZ	(R5+R14),R10
	MOVHZ	(R6+R14),R11
#endif
	CMPU	R10,R11
	BGT	greater
	BLT	less
	ADD	$2,R14
	ADD	$-2,R9
	PCALIGN	$16
byte:
	CMP	R9,$0		// process 1 byte
	BEQ	skip
	MOVBZ	(R5+R14),R10
	MOVBZ	(R6+R14),R11
	CMPU	R10,R11
	BGT	greater
	BLT	less
	PCALIGN	$16
skip:
	// All compared bytes matched: the lengths decide the result.
	BEQ	CR2,equal
	BGT	CR2,greater
less:
	MOVD	$-1,R3		// return value if A < B
	RET
equal:
	MOVD	$0, R3		// return value if A == B
	RET