// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build ppc64 || ppc64le

#include "go_asm.h"
#include "textflag.h"

// Compare compares two byte slices and returns -1, 0 or +1 in R3.
//
// The arguments are shuffled into the register layout expected by the
// shared comparison bodies (R3 = a len, R4 = b len, R5 = a addr,
// R6 = b addr). If both slices start at the same address only the lengths
// are compared. Otherwise control tail-branches to cmpbodyp9 when the
// PPC64.HasPOWER9 flag byte is 1, and to cmpbody when it is not.
TEXT ·Compare<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
	// incoming:
	// R3 a addr -> R5
	// R4 a len  -> R3
	// R5 a cap unused
	// R6 b addr -> R6
	// R7 b len  -> R4
	// R8 b cap unused
	MOVD	R3, R5
	MOVD	R4, R3
	MOVD	R7, R4
	CMP	R5,R6,CR7	// same base address?
	CMP	R3,R4,CR6	// a len vs b len, consumed at equal: below
	BEQ	CR7,equal
	MOVBZ	internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
	CMP	R16,$1
	BNE	power8
	BR	cmpbodyp9<>(SB)
power8:
	BR	cmpbody<>(SB)
equal:
	// Same address: the result depends only on the lengths.
	BEQ	CR6,done
	MOVD	$1, R8
	BGT	CR6,greater
	NEG	R8		// a len < b len: result is -1
greater:
	MOVD	R8, R3
	RET
done:
	MOVD	$0, R3
	RET

// cmpstring compares two strings and returns -1, 0 or +1 in R3.
//
// Identical to Compare above except for the incoming register layout:
// strings carry no capacity word, so b arrives in R5/R6.
TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
	// incoming:
	// R3 a addr -> R5
	// R4 a len  -> R3
	// R5 b addr -> R6
	// R6 b len  -> R4
	MOVD	R6, R7		// park b len in R7; R6 is overwritten next
	MOVD	R5, R6
	MOVD	R3, R5
	MOVD	R4, R3
	MOVD	R7, R4
	CMP	R5,R6,CR7	// same base address?
	CMP	R3,R4,CR6	// a len vs b len, consumed at equal: below
	BEQ	CR7,equal
	MOVBZ	internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
	CMP	R16,$1
	BNE	power8
	BR	cmpbodyp9<>(SB)
power8:
	BR	cmpbody<>(SB)
equal:
	// Same address: the result depends only on the lengths.
	BEQ	CR6,done
	MOVD	$1, R8
	BGT	CR6,greater
	NEG	R8		// a len < b len: result is -1
greater:
	MOVD	R8, R3
	RET

done:
	MOVD	$0, R3
	RET

#ifdef GOARCH_ppc64le
// 16-byte VPERM control vector used by the "different" path of cmpbody.
// NOTE(review): presumably this puts the LXVD2X-loaded bytes back into
// memory order so the unsigned doubleword compares see them in
// lexicographic order on little-endian -- confirm against the Power ISA.
DATA byteswap<>+0(SB)/8, $0x0706050403020100
DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908
GLOBL byteswap<>+0(SB), RODATA, $16
#define SWAP V21
#endif

// Do an efficient memcmp for ppc64le/ppc64/POWER8
// R3 = a len
// R4 = b len
// R5 = a addr
// R6 = b addr
// On exit:
// R3 = return value (-1 if A < B, 0 if A == B, +1 if A > B)
//
// CR2 holds the a len vs b len comparison for the whole routine and
// decides the result once the common prefix has compared equal.
// R8 = number of bytes to compare (the smaller length), R9 = bytes remaining.
TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
	MOVD	R3,R8		// set up length
	CMP	R3,R4,CR2	// unequal?
	BLT	CR2,setuplen	// a len is the smaller one, keep R3
	MOVD	R4,R8		// use R4 for comparison len
setuplen:
	CMP	R8,$32		// optimize >= 32
	MOVD	R8,R9
	BLT	setup8a		// optimize < 32
	MOVD	$16,R10		// set offsets to load into vectors
	CMP	R8,$64
	BLT	cmp32		// process size 32-63

	DCBT	(R5)		// optimize >= 64
	DCBT	(R6)		// cache hint
	MOVD	$32,R11		// set offsets to load into vector
	MOVD	$48,R12		// set offsets to load into vector

loop64a:// process size 64 and greater
	LXVD2X	(R5)(R0),V3	// load bytes of A at offset 0 into vector
	LXVD2X	(R6)(R0),V4	// load bytes of B at offset 0 into vector
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different	// jump out if it's different

	LXVD2X	(R5)(R10),V3	// load bytes of A at offset 16 into vector
	LXVD2X	(R6)(R10),V4	// load bytes of B at offset 16 into vector

	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different

	LXVD2X	(R5)(R11),V3	// load bytes of A at offset 32 into vector
	LXVD2X	(R6)(R11),V4	// load bytes of B at offset 32 into vector

	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different

	LXVD2X	(R5)(R12),V3	// load bytes of A at offset 48 into vector
	LXVD2X	(R6)(R12),V4	// load bytes of B at offset 48 into vector

	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different

	ADD	$-64,R9,R9	// reduce remaining size by 64
	ADD	$64,R5,R5	// increment to next 64 bytes of A
	ADD	$64,R6,R6	// increment to next 64 bytes of B
	CMPU	R9,$64
	BGE	loop64a		// loop back to loop64a only if there are >= 64 bytes remaining

	CMPU	R9,$32
	BGE	cmp32		// jump to cmp32 if there are 32-63 bytes remaining
	CMPU	R9,$0
	BNE	rem		// jump to rem if the remainder is not 0

	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
	BLT	CR2,less	// jump to less if len(A)<len(B)
	BR	greater		// jump to greater otherwise
cmp32:
	LXVD2X	(R5)(R0),V3	// load bytes of A at offset 0 into vector
	LXVD2X	(R6)(R0),V4	// load bytes of B at offset 0 into vector

	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different

	LXVD2X	(R5)(R10),V3	// load bytes of A at offset 16 into vector
	LXVD2X	(R6)(R10),V4	// load bytes of B at offset 16 into vector

	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different

	ADD	$-32,R9,R9	// reduce remaining size by 32
	ADD	$32,R5,R5	// increment to next 32 bytes of A
	ADD	$32,R6,R6	// increment to next 32 bytes of B
	CMPU	R9,$0
	BNE	rem		// jump to rem if the remainder is not 0
	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
	BLT	CR2,less	// jump to less if len(A)<len(B)
	BR	greater		// jump to greater otherwise
rem:
	MOVD	R9,R8		// the remainder becomes the new compare length
	ANDCC	$24,R8,R9	// Any 8 byte chunks?
	BEQ	leftover	// and result is 0
	BR	setup8a

different:
	// V3/V4 hold the 16-byte chunks of A and B that failed the vector compare.
#ifdef	GOARCH_ppc64le
	MOVD	$byteswap<>+00(SB), R16
	LXVD2X	(R16)(R0),SWAP	// Set up swap string

	VPERM	V3,V3,SWAP,V3
	VPERM	V4,V4,SWAP,V4
#endif
	MFVSRD	VS35,R16	// move upper doublewords of A and B into GPR for comparison
	MFVSRD	VS36,R10

	CMPU	R16,R10
	BEQ	lower
	BGT	greater
	MOVD	$-1,R3		// return value if A < B
	RET
lower:
	VSLDOI	$8,V3,V3,V3	// move lower doublewords of A and B into GPR for comparison
	MFVSRD	VS35,R16
	VSLDOI	$8,V4,V4,V4
	MFVSRD	VS36,R10

	CMPU	R16,R10
	BGT	greater
	MOVD	$-1,R3		// return value if A < B
	RET
setup8a:
	SRADCC	$3,R8,R9	// get the 8 byte count
	BEQ	leftover	// shifted value is 0
	CMPU	R8,$8		// optimize 8byte move
	BEQ	size8
	CMPU	R8,$16
	BEQ	size16
	MOVD	R9,CTR		// loop count for doublewords
loop8:
#ifdef  GOARCH_ppc64le
	MOVDBR	(R5+R0),R16	// doublewords to compare
	MOVDBR	(R6+R0),R10	// LE compare order
#else
	MOVD	(R5+R0),R16	// doublewords to compare
	MOVD	(R6+R0),R10	// BE compare order
#endif
	ADD	$8,R5
	ADD	$8,R6
	CMPU	R16,R10		// match?
	BC	8,2,loop8	// bt ctr <> 0 && cr
	BGT	greater
	BLT	less
leftover:
	ANDCC	$7,R8,R9	// check for leftover bytes
	BEQ	zeroremainder
simplecheck:
	MOVD	R0,R14		// R14 = running offset into A and B, starts at 0
	CMP	R9,$4		// process 4 bytes
	BLT	halfword
#ifdef  GOARCH_ppc64le
	MOVWBR	(R5)(R14),R10
	MOVWBR	(R6)(R14),R11
#else
	MOVWZ	(R5)(R14),R10
	MOVWZ	(R6)(R14),R11
#endif
	CMPU	R10,R11
	BGT	greater
	BLT	less
	ADD	$-4,R9
	ADD	$4,R14
	PCALIGN	$16

halfword:
	CMP	R9,$2		// process 2 bytes
	BLT	byte
#ifdef  GOARCH_ppc64le
	MOVHBR	(R5)(R14),R10
	MOVHBR	(R6)(R14),R11
#else
	MOVHZ	(R5)(R14),R10
	MOVHZ	(R6)(R14),R11
#endif
	CMPU	R10,R11
	BGT	greater
	BLT	less
	ADD	$-2,R9
	ADD	$2,R14
	PCALIGN	$16
byte:
	CMP	R9,$0		// process 1 byte
	BEQ	skip
	MOVBZ	(R5)(R14),R10
	MOVBZ	(R6)(R14),R11
	CMPU	R10,R11
	BGT	greater
	BLT	less
	PCALIGN	$16
skip:
	// Common prefix is identical: the lengths decide the result.
	BEQ	CR2,equal
	BGT	CR2,greater

less:
	MOVD	$-1,R3		// return value if A < B
	RET
size16:
	LXVD2X	(R5)(R0),V3	// load bytes of A at offset 0 into vector
	LXVD2X	(R6)(R0),V4	// load bytes of B at offset 0 into vector
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different
zeroremainder:
	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
	BLT	CR2,less	// jump to less if len(A)<len(B)
	BR	greater		// jump to greater otherwise
size8:
#ifdef  GOARCH_ppc64le
	MOVDBR	(R5+R0),R16	// doublewords to compare
	MOVDBR	(R6+R0),R10	// LE compare order
#else
	MOVD	(R5+R0),R16	// doublewords to compare
	MOVD	(R6+R0),R10	// BE compare order
#endif
	CMPU	R16,R10		// match?
	BGT	greater
	BLT	less
	BGT	CR2,greater	// bytes equal and len(A) > len(B)
	BLT	CR2,less	// bytes equal and len(A) < len(B)
equal:
	MOVD	$0, R3		// return value if A == B
	RET
greater:
	MOVD	$1,R3		// return value if A > B
	RET

// Do an efficient memcmp for ppc64le/ppc64/POWER9
// R3 = a len
// R4 = b len
// R5 = a addr
// R6 = b addr
// On exit:
// R3 = return value (-1 if A < B, 0 if A == B, +1 if A > B)
//
// CR2 holds the a len vs b len comparison for the whole routine and
// decides the result once the common prefix has compared equal.
// R8 = number of bytes to compare (the smaller length), R9 = bytes remaining.
TEXT cmpbodyp9<>(SB),NOSPLIT|NOFRAME,$0-0
	MOVD	R3,R8		// set up length
	CMP	R3,R4,CR2	// unequal?
	BLT	CR2,setuplen	// a len is the smaller one, keep R3
	MOVD	R4,R8		// use R4 for comparison len
setuplen:
	CMP	R8,$16		// optimize for size<16
	MOVD	R8,R9
	BLT	simplecheck
	MOVD	$16,R10		// set offsets to load into vectors
	CMP	R8,$32		// optimize for size 16-31
	BLT	cmp16
	CMP	R8,$64
	BLT	cmp32		// optimize for size 32-63
	DCBT	(R5)		// optimize for size>=64
	DCBT	(R6)		// cache hint

	MOVD	$32,R11		// set offsets to load into vector
	MOVD	$48,R12		// set offsets to load into vector

loop64a:// process size 64 and greater
	LXVB16X	(R0)(R5),V3	// load bytes of A at offset 0 into vector
	LXVB16X	(R0)(R6),V4	// load bytes of B at offset 0 into vector
	VCMPNEBCC	V3,V4,V1	// record comparison into V1
	BNE	CR6,different	// jump out if it's different

	LXVB16X	(R10)(R5),V3	// load bytes of A at offset 16 into vector
	LXVB16X	(R10)(R6),V4	// load bytes of B at offset 16 into vector
	VCMPNEBCC	V3,V4,V1
	BNE	CR6,different

	LXVB16X	(R11)(R5),V3	// load bytes of A at offset 32 into vector
	LXVB16X	(R11)(R6),V4	// load bytes of B at offset 32 into vector
	VCMPNEBCC	V3,V4,V1
	BNE	CR6,different

	LXVB16X	(R12)(R5),V3	// load bytes of A at offset 48 into vector
	LXVB16X	(R12)(R6),V4	// load bytes of B at offset 48 into vector
	VCMPNEBCC	V3,V4,V1
	BNE	CR6,different

	ADD	$-64,R9,R9	// reduce remaining size by 64
	ADD	$64,R5,R5	// increment to next 64 bytes of A
	ADD	$64,R6,R6	// increment to next 64 bytes of B
	CMPU	R9,$64
	BGE	loop64a		// loop back to loop64a only if there are >= 64 bytes remaining

	CMPU	R9,$32
	BGE	cmp32		// jump to cmp32 if there are 32-63 bytes remaining
	CMPU	R9,$16
	BGE	cmp16		// jump to cmp16 if there are 16-31 bytes left
	CMPU	R9,$0
	BNE	simplecheck	// jump to simplecheck for remaining bytes

	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
	BLT	CR2,less	// jump to less if len(A)<len(B)
	BR	greater		// jump to greater otherwise
cmp32:
	LXVB16X	(R0)(R5),V3	// load bytes of A at offset 0 into vector
	LXVB16X	(R0)(R6),V4	// load bytes of B at offset 0 into vector

	VCMPNEBCC	V3,V4,V1	// record comparison into V1
	BNE	CR6,different	// jump out if it's different

	LXVB16X	(R10)(R5),V3	// load bytes of A at offset 16 into vector
	LXVB16X	(R10)(R6),V4	// load bytes of B at offset 16 into vector
	VCMPNEBCC	V3,V4,V1
	BNE	CR6,different

	ADD	$-32,R9,R9	// reduce remaining size by 32
	ADD	$32,R5,R5	// increment to next 32 bytes of A
	ADD	$32,R6,R6	// increment to next 32 bytes of B
	CMPU	R9,$16		// jump to cmp16 if there are 16-31 bytes left
	BGE	cmp16
	CMPU	R9,$0
	BNE	simplecheck	// jump to simplecheck for remainder bytes
	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
	BLT	CR2,less	// jump to less if len(A)<len(B)
	BR	greater		// jump to greater otherwise
different:
	// V3/V4 hold the 16-byte chunks of A and B that failed the vector compare.

	MFVSRD	VS35,R16	// move upper doublewords of A and B into GPR for comparison
	MFVSRD	VS36,R10

	CMPU	R16,R10
	BEQ	lower
	BGT	greater
	MOVD	$-1,R3		// return value if A < B
	RET
lower:
	MFVSRLD	VS35,R16	// next move lower doublewords of A and B into GPR for comparison
	MFVSRLD	VS36,R10

	CMPU	R16,R10
	BGT	greater
	MOVD	$-1,R3		// return value if A < B
	RET

greater:
	MOVD	$1,R3		// return value if A > B
	RET
cmp16:
	// Every branch into cmp16 is taken with 16 <= R9 <= 31.
	ANDCC	$16,R9,R31	// full 16-byte chunk pending? R31 is scratch, only CR0 is used
	BEQ	tail

	LXVB16X	(R0)(R5),V3	// load the next 16 bytes of A (offset 0) into vector
	LXVB16X	(R0)(R6),V4	// load the next 16 bytes of B (offset 0) into vector
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different

	ADD	$16,R5
	ADD	$16,R6
tail:
	ANDCC	$15,R9		// 1-15 trailing bytes left? Compare them via the last 16 bytes.
	BEQ	end

	// Point R5/R6 at the end of the data and load the final 16 bytes at
	// offset -16. The load overlaps bytes that already compared equal, so
	// any difference found lies in the trailing bytes.
	ADD	R9,R5
	ADD	R9,R6
	MOVD	$-16,R10

	LXVB16X	(R10)(R5),V3	// load the last 16 bytes of A into vector
	LXVB16X	(R10)(R6),V4	// load the last 16 bytes of B into vector
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different
end:
	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
	BLT	CR2,less	// jump to less if len(A)<len(B)
	BR	greater		// jump to greater otherwise
simplecheck:
	MOVD	$0,R14		// R14 = running offset into A and B, starts at 0
	CMP	R9,$8		// process 8 bytes
	BLT	word
#ifdef  GOARCH_ppc64le
	MOVDBR	(R5+R14),R10
	MOVDBR	(R6+R14),R11
#else
	MOVD	(R5+R14),R10
	MOVD	(R6+R14),R11
#endif
	CMPU	R10,R11
	BGT	greater
	BLT	less
	ADD	$8,R14
	ADD	$-8,R9
	PCALIGN	$16
word:
	CMP	R9,$4		// process 4 bytes
	BLT	halfword
#ifdef  GOARCH_ppc64le
	MOVWBR	(R5+R14),R10
	MOVWBR	(R6+R14),R11
#else
	MOVWZ	(R5+R14),R10
	MOVWZ	(R6+R14),R11
#endif
	CMPU	R10,R11
	BGT	greater
	BLT	less
	ADD	$4,R14
	ADD	$-4,R9
	PCALIGN	$16
halfword:
	CMP	R9,$2		// process 2 bytes
	BLT	byte
#ifdef  GOARCH_ppc64le
	MOVHBR	(R5+R14),R10
	MOVHBR	(R6+R14),R11
#else
	MOVHZ	(R5+R14),R10
	MOVHZ	(R6+R14),R11
#endif
	CMPU	R10,R11
	BGT	greater
	BLT	less
	ADD	$2,R14
	ADD	$-2,R9
	PCALIGN	$16
byte:
	CMP	R9,$0		// process 1 byte
	BEQ	skip
	MOVBZ	(R5+R14),R10
	MOVBZ	(R6+R14),R11
	CMPU	R10,R11
	BGT	greater
	BLT	less
	PCALIGN	$16
skip:
	// Common prefix is identical: the lengths decide the result.
	BEQ	CR2,equal
	BGT	CR2,greater
less:
	MOVD	$-1,R3		// return value if A < B
	RET
equal:
	MOVD	$0, R3		// return value if A == B
	RET