github.com/zebozhuang/go@v0.0.0-20200207033046-f8a98f6f5c5d/src/hash/crc32/crc32_ppc64le.s

// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// The vectorized implementation found below is a derived work
// from code written by Anton Blanchard <anton@au.ibm.com> found
// at https://github.com/antonblanchard/crc32-vpmsum. The original
// is dual licensed under GPL and Apache 2. As the copyright holder
// for the work, IBM has contributed this new work under
// the golang license.

// Changes include porting to Go assembler with modifications for
// the Go ABI for ppc64le.

#include "textflag.h"

#define POWER8_OFFSET 132

#define off16 R16
#define off32 R17
#define off48 R18
#define off64 R19
#define off80 R20
#define off96 R21
#define off112 R22

#define const1 V24
#define const2 V25

#define byteswap V26
#define mask_32bit V27
#define mask_64bit V28
#define zeroes V29

#define MAX_SIZE 32*1024
#define REFLECT

TEXT ·ppc64SlicingUpdateBy8(SB), NOSPLIT|NOFRAME, $0-44
	MOVWZ crc+0(FP), R3   // incoming crc
	MOVD table8+8(FP), R4 // *Table
	MOVD p+16(FP), R5
	MOVD p_len+24(FP), R6 // p len

	CMP $0,R6          // len == 0?
	BNE start
	MOVW R3,ret+40(FP) // return crc
	RET

start:
	NOR R3,R3,R7  // ^crc
	MOVWZ R7,R7   // 32 bits
	CMP R6,$16
	MOVD R6,CTR
	BLT short
	SRAD $3,R6,R8 // 8 byte chunks
	MOVD R8,CTR

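	// The loop below implements slicing-by-8: each iteration folds 8
	// bytes of input into the crc using eight 256-entry lookup tables
	// (tab[0]..tab[7], laid out 1024 bytes apart). A rough Go sketch
	// of one iteration (le32 is an illustrative helper, not a real
	// function in this package):
	//
	//	crc ^= le32(p[0:4])
	//	crc = tab[0][p[7]] ^ tab[1][p[6]] ^ tab[2][p[5]] ^ tab[3][p[4]] ^
	//		tab[4][crc>>24] ^ tab[5][crc>>16&0xFF] ^
	//		tab[6][crc>>8&0xFF] ^ tab[7][crc&0xFF]
	//	p = p[8:]
	//
	// The loads, address computations, and xors are interleaved to
	// hide lookup latency.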
loop:
	MOVWZ 0(R5),R8        // 0-3 bytes of p (little endian)
	MOVWZ 4(R5),R9        // 4-7 bytes of p
	MOVD R4,R10           // &tab[0]
	XOR R7,R8,R7          // crc ^= p[0:4]
	RLDICL $40,R9,$56,R17 // p[7]
	SLD $2,R17,R17        // p[7]*4
	RLDICL $40,R7,$56,R8  // crc>>24
	ADD R17,R10,R17       // &tab[0][p[7]]
	SLD $2,R8,R8          // crc>>24*4
	RLDICL $48,R9,$56,R18 // p[6]
	SLD $2,R18,R18        // p[6]*4
	ADD $1024,R10,R10     // tab[1]
	MOVWZ 0(R17),R21      // tab[0][p[7]]
	RLDICL $56,R9,$56,R19 // p[5]
	ADD R10,R18,R18       // &tab[1][p[6]]
	SLD $2,R19,R19        // p[5]*4
	MOVWZ 0(R18),R22      // tab[1][p[6]]
	ADD $1024,R10,R10     // tab[2]
	XOR R21,R22,R21       // xor done R22
	ADD R19,R10,R19       // &tab[2][p[5]]
	ANDCC $255,R9,R20     // p[4]
	SLD $2,R20,R20        // p[4]*4
	MOVWZ 0(R19),R23      // tab[2][p[5]]
	ADD $1024,R10,R10     // &tab[3]
	ADD R20,R10,R20       // &tab[3][p[4]]
	XOR R21,R23,R21       // xor done R23
	ADD $1024,R10,R10     // &tab[4]
	MOVWZ 0(R20),R24      // tab[3][p[4]]
	ADD R10,R8,R23        // &tab[4][crc>>24]
	XOR R21,R24,R21       // xor done R24
	MOVWZ 0(R23),R25      // tab[4][crc>>24]
	RLDICL $48,R7,$56,R24 // crc>>16&0xFF
	XOR R21,R25,R21       // xor done R25
	ADD $1024,R10,R10     // &tab[5]
	SLD $2,R24,R24        // crc>>16&0xFF*4
	ADD R24,R10,R24       // &tab[5][crc>>16&0xFF]
	MOVWZ 0(R24),R26      // tab[5][crc>>16&0xFF]
	XOR R21,R26,R21       // xor done R26
	RLDICL $56,R7,$56,R25 // crc>>8
	ADD $1024,R10,R10     // &tab[6]
	SLD $2,R25,R25        // crc>>8&0xFF*4
	ADD R25,R10,R25       // &tab[6][crc>>8&0xFF]
	MOVBZ R7,R26          // crc&0xFF
	ADD $1024,R10,R10     // &tab[7]
	MOVWZ 0(R25),R27      // tab[6][crc>>8&0xFF]
	SLD $2,R26,R26        // crc&0xFF*4
	XOR R21,R27,R21       // xor done R27
	ADD R26,R10,R26       // &tab[7][crc&0xFF]
	ADD $8,R5             // p = p[8:]
	MOVWZ 0(R26),R28      // tab[7][crc&0xFF]
	XOR R21,R28,R21       // xor done R28
	MOVWZ R21,R7          // crc for next round
	BC 16,0,loop          // next 8 bytes (bdnz)
	ANDCC $7,R6,R8        // any leftover bytes
	BEQ done              // none --> done
	MOVD R8,CTR           // byte count

short:
	MOVBZ 0(R5),R8  // get v
	MOVBZ R7,R9     // byte(crc) -> R9
	MOVWZ R7,R14
	SRD $8,R14,R14  // crc>>8
	XOR R8,R9,R8    // byte(crc)^v -> R8
	ADD $1,R5       // ptr to next v
	SLD $2,R8       // index*4 -> byte offset
	ADD R8,R4,R9    // &tab[byte(crc)^v]
	MOVWZ 0(R9),R10 // tab[byte(crc)^v]
	XOR R10,R14,R7  // loop crc in R7
	MOVWZ R7,R7     // 32 bits
	BC 16,0,short
done:
	NOR R7,R7,R7       // ^crc
	MOVW R7,ret+40(FP) // return crc
	RET

#ifdef BYTESWAP_DATA
DATA ·byteswapcons+0(SB)/8,$0x0706050403020100
DATA ·byteswapcons+8(SB)/8,$0x0f0e0d0c0b0a0908

GLOBL ·byteswapcons+0(SB),RODATA,$16
#endif

TEXT ·vectorCrc32(SB), NOSPLIT|NOFRAME, $0-36
	MOVWZ crc+0(FP), R3   // incoming crc
	MOVWZ ctab+4(FP), R14 // crc poly id
	MOVD p+8(FP), R4
	MOVD p_len+16(FP), R5 // p len

	// R3 = incoming crc
	// R14 = constant table identifier
	// R4 = address of bytes
	// R5 = length of bytes

	// defines for index loads

	MOVD $16,off16
	MOVD $32,off32
	MOVD $48,off48
	MOVD $64,off64
	MOVD $80,off80
	MOVD $96,off96
	MOVD $112,off112
	MOVD $0,R15

	MOVD R3,R10 // save initial crc

	NOR R3,R3,R3 // ^crc
	MOVWZ R3,R3  // 32 bits
	VXOR zeroes,zeroes,zeroes // clear the V reg
	VSPLTISW $-1,V0
	VSLDOI $4,V29,V0,mask_32bit
	VSLDOI $8,V29,V0,mask_64bit

	VXOR V8,V8,V8
	MTVSRD R3,VS40 // crc initial value VS40 = V8

#ifdef REFLECT
	VSLDOI $8,zeroes,V8,V8 // or: VSLDOI V29,V8,V27,4 for top 32 bits?
#else
	VSLDOI $4,V8,zeroes,V8
#endif

#ifdef BYTESWAP_DATA
	MOVD $·byteswapcons(SB),R3
	LVX (R3),byteswap
#endif

	CMPU R5,$256 // length of bytes
	BLT short

	RLDICR $0,R5,$56,R6 // chunk to process = len &^ 127

	// First step for larger sizes
l1:	MOVD $32768,R7 // MAX_SIZE
	MOVD R7,R9
	CMP R6,R7  // compare len to max chunk size
	BGT top    // longer than max, process a max-size chunk
	MOVD R6,R7 // otherwise process all that is left
top:
	SUB R7,R6,R6

	// mainloop does 128 bytes at a time
	SRD $7,R7

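	// The constants table is sized for the maximum 32KB chunk:
	// 32768/128 = 256 iterations, one 16-byte constant per
	// iteration, 4096 bytes in total. Shorter chunks start partway
	// into the table so that every chunk finishes on the same final
	// constants. The three instructions after the next comment
	// compute, in effect (a sketch; R7 holds chunklen/128 here):
	//
	//	R8 = 4096 - 16*(chunklen/128)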
	// determine the offset into the constants table to start with.
	// Each 16-byte constant is used against 128 bytes of data.
	SLD $4,R7,R8 // iterations * 16
	SRD $3,R9,R9 // 32768 >> 3 = 4096
	SUB R8,R9,R8 // starting offset

	// The last iteration is reduced in a separate step
	ADD $-1,R7
	MOVD R7,CTR

	// Determine which constant table (depends on poly)
	CMP R14,$1
	BNE castTable
	MOVD $·IEEEConst(SB),R3
	BR startConst
castTable:
	MOVD $·CastConst(SB),R3

startConst:
	ADD R3,R8,R3 // starting point in constants table

	VXOR V0,V0,V0 // clear the V regs
	VXOR V1,V1,V1
	VXOR V2,V2,V2
	VXOR V3,V3,V3
	VXOR V4,V4,V4
	VXOR V5,V5,V5
	VXOR V6,V6,V6
	VXOR V7,V7,V7

	LVX (R3),const1 // loading constant values

	CMP R15,$1 // Identify warm up pass
	BEQ next

	// First warm up pass: load the bytes to process
	LVX (R4),V16
	LVX (R4+off16),V17
	LVX (R4+off32),V18
	LVX (R4+off48),V19
	LVX (R4+off64),V20
	LVX (R4+off80),V21
	LVX (R4+off96),V22
	LVX (R4+off112),V23
	ADD $128,R4 // bump up to next 128 bytes in buffer

	VXOR V16,V8,V16 // xor in initial CRC in V8

next:
	BC 18,0,first_warm_up_done // bdz: branch if --CTR == 0

	ADD $16,R3      // bump up to next constants
	LVX (R3),const2 // table values

	VPMSUMD V16,const1,V8 // second warm up pass
	LVX (R4),V16          // load from buffer
	OR $0,R2,R2           // nop, for instruction scheduling

	VPMSUMD V17,const1,V9 // vpmsumd with constants
	LVX (R4+off16),V17    // load next from buffer
	OR $0,R2,R2

	VPMSUMD V18,const1,V10 // vpmsumd with constants
	LVX (R4+off32),V18     // load next from buffer
	OR $0,R2,R2

	VPMSUMD V19,const1,V11 // vpmsumd with constants
	LVX (R4+off48),V19     // load next from buffer
	OR $0,R2,R2

	VPMSUMD V20,const1,V12 // vpmsumd with constants
	LVX (R4+off64),V20     // load next from buffer
	OR $0,R2,R2

	VPMSUMD V21,const1,V13 // vpmsumd with constants
	LVX (R4+off80),V21     // load next from buffer
	OR $0,R2,R2

	VPMSUMD V22,const1,V14 // vpmsumd with constants
	LVX (R4+off96),V22     // load next from buffer
	OR $0,R2,R2

	VPMSUMD V23,const1,V15 // vpmsumd with constants
	LVX (R4+off112),V23    // load next from buffer

	ADD $128,R4 // bump up to next 128 bytes in buffer

	BC 18,0,first_cool_down

cool_top:
	LVX (R3),const1 // constants
	ADD $16,R3      // inc to next constants
	OR $0,R2,R2

	VXOR V0,V8,V0         // xor in previous vpmsumd
	VPMSUMD V16,const2,V8 // vpmsumd with constants
	LVX (R4),V16          // buffer
	OR $0,R2,R2

	VXOR V1,V9,V1         // xor in previous
	VPMSUMD V17,const2,V9 // vpmsumd with constants
	LVX (R4+off16),V17    // next in buffer
	OR $0,R2,R2

	VXOR V2,V10,V2         // xor in previous
	VPMSUMD V18,const2,V10 // vpmsumd with constants
	LVX (R4+off32),V18     // next in buffer
	OR $0,R2,R2

	VXOR V3,V11,V3         // xor in previous
	VPMSUMD V19,const2,V11 // vpmsumd with constants
	LVX (R4+off48),V19     // next in buffer
	LVX (R3),const2        // get next constant
	OR $0,R2,R2

	VXOR V4,V12,V4         // xor in previous
	VPMSUMD V20,const1,V12 // vpmsumd with constants
	LVX (R4+off64),V20     // next in buffer
	OR $0,R2,R2

	VXOR V5,V13,V5         // xor in previous
	VPMSUMD V21,const1,V13 // vpmsumd with constants
	LVX (R4+off80),V21     // next in buffer
	OR $0,R2,R2

	VXOR V6,V14,V6         // xor in previous
	VPMSUMD V22,const1,V14 // vpmsumd with constants
	LVX (R4+off96),V22     // next in buffer
	OR $0,R2,R2

	VXOR V7,V15,V7         // xor in previous
	VPMSUMD V23,const1,V15 // vpmsumd with constants
	LVX (R4+off112),V23    // next in buffer

	ADD $128,R4      // bump up buffer pointer
	BC 16,0,cool_top // loop while CTR != 0 (bdnz)

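	// Once the steady-state loop exits, two cool-down passes drain
	// the software pipeline: first_cool_down multiplies the last
	// loaded data (V16-V23) by the final constants into V8-V15, and
	// second_cool_down folds those products into the accumulators
	// V0-V7.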
first_cool_down:

	// load the constants
	// xor in the previous value
	// vpmsumd the result with constants

	LVX (R3),const1
	ADD $16,R3

	VXOR V0,V8,V0
	VPMSUMD V16,const1,V8
	OR $0,R2,R2

	VXOR V1,V9,V1
	VPMSUMD V17,const1,V9
	OR $0,R2,R2

	VXOR V2,V10,V2
	VPMSUMD V18,const1,V10
	OR $0,R2,R2

	VXOR V3,V11,V3
	VPMSUMD V19,const1,V11
	OR $0,R2,R2

	VXOR V4,V12,V4
	VPMSUMD V20,const1,V12
	OR $0,R2,R2

	VXOR V5,V13,V5
	VPMSUMD V21,const1,V13
	OR $0,R2,R2

	VXOR V6,V14,V6
	VPMSUMD V22,const1,V14
	OR $0,R2,R2

	VXOR V7,V15,V7
	VPMSUMD V23,const1,V15
	OR $0,R2,R2

second_cool_down:

	VXOR V0,V8,V0
	VXOR V1,V9,V1
	VXOR V2,V10,V2
	VXOR V3,V11,V3
	VXOR V4,V12,V4
	VXOR V5,V13,V5
	VXOR V6,V14,V6
	VXOR V7,V15,V7

#ifdef REFLECT
	VSLDOI $4,V0,zeroes,V0
	VSLDOI $4,V1,zeroes,V1
	VSLDOI $4,V2,zeroes,V2
	VSLDOI $4,V3,zeroes,V3
	VSLDOI $4,V4,zeroes,V4
	VSLDOI $4,V5,zeroes,V5
	VSLDOI $4,V6,zeroes,V6
	VSLDOI $4,V7,zeroes,V7
#endif

	LVX (R4),V8
	LVX (R4+off16),V9
	LVX (R4+off32),V10
	LVX (R4+off48),V11
	LVX (R4+off64),V12
	LVX (R4+off80),V13
	LVX (R4+off96),V14
	LVX (R4+off112),V15

	ADD $128,R4

	VXOR V0,V8,V16
	VXOR V1,V9,V17
	VXOR V2,V10,V18
	VXOR V3,V11,V19
	VXOR V4,V12,V20
	VXOR V5,V13,V21
	VXOR V6,V14,V22
	VXOR V7,V15,V23

	MOVD $1,R15 // mark that the warm up pass has run
	CMP $0,R6   // more chunk bytes to process?
	ADD $128,R6

	BNE l1
	ANDCC $127,R5   // tail bytes (len mod 128)
	SUBC R5,$128,R6 // 128 - tail
	ADD R3,R6,R3    // skip constants for data not present

	SRD $4,R5,R7 // number of remaining 16-byte blocks
	MOVD R7,CTR
	LVX (R3),V0
	LVX (R3+off16),V1
	LVX (R3+off32),V2
	LVX (R3+off48),V3
	LVX (R3+off64),V4
	LVX (R3+off80),V5
	LVX (R3+off96),V6
	LVX (R3+off112),V7

	ADD $128,R3

	VPMSUMW V16,V0,V0
	VPMSUMW V17,V1,V1
	VPMSUMW V18,V2,V2
	VPMSUMW V19,V3,V3
	VPMSUMW V20,V4,V4
	VPMSUMW V21,V5,V5
	VPMSUMW V22,V6,V6
	VPMSUMW V23,V7,V7

	// now reduce the tail (0-112 bytes)

	CMP $0,R7
	BEQ next1

	LVX (R4),V16
	LVX (R3),V17
	VPMSUMW V16,V17,V16
	VXOR V0,V16,V0
	BC 18,0,next1

	LVX (R4+off16),V16
	LVX (R3+off16),V17
	VPMSUMW V16,V17,V16
	VXOR V0,V16,V0
	BC 18,0,next1

	LVX (R4+off32),V16
	LVX (R3+off32),V17
	VPMSUMW V16,V17,V16
	VXOR V0,V16,V0
	BC 18,0,next1

	LVX (R4+off48),V16
	LVX (R3+off48),V17
	VPMSUMW V16,V17,V16
	VXOR V0,V16,V0
	BC 18,0,next1

	LVX (R4+off64),V16
	LVX (R3+off64),V17
	VPMSUMW V16,V17,V16
	VXOR V0,V16,V0
	BC 18,0,next1

	LVX (R4+off80),V16
	LVX (R3+off80),V17
	VPMSUMW V16,V17,V16
	VXOR V0,V16,V0
	BC 18,0,next1

	LVX (R4+off96),V16
	LVX (R3+off96),V17
	VPMSUMW V16,V17,V16
	VXOR V0,V16,V0

next1:
	VXOR V0,V1,V0 // fold the accumulators together
	VXOR V2,V3,V2
	VXOR V4,V5,V4
	VXOR V6,V7,V6
	VXOR V0,V2,V0
	VXOR V4,V6,V4
	VXOR V0,V4,V0

barrett_reduction:

	CMP R14,$1
	BNE barcstTable
	MOVD $·IEEEBarConst(SB),R3
	BR startbarConst
barcstTable:
	MOVD $·CastBarConst(SB),R3

startbarConst:
	LVX (R3),const1
	LVX (R3+off16),const2

	VSLDOI $8,V0,V0,V1
	VXOR V0,V1,V0 // xor the upper and lower 64-bit halves

#ifdef REFLECT
	VSPLTISB $1,V1
	VSL V0,V1,V0 // shift left one bit
#endif

	VAND V0,mask_64bit,V0

#ifndef REFLECT

	VPMSUMD V0,const1,V1
	VSLDOI $8,zeroes,V1,V1
	VPMSUMD V1,const2,V1
	VXOR V0,V1,V0
	VSLDOI $8,V0,zeroes,V0

#else
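	// Reflected Barrett reduction. With the folded 64-bit remainder
	// in V0, two carry-less multiplies recover the 32-bit crc.
	// Roughly (a sketch; mu and poly are illustrative names for the
	// two Barrett constants loaded above):
	//
	//	t   = clmul(V0 & 0xffffffff, mu)   // VPMSUMD with const1
	//	t   = clmul(t  & 0xffffffff, poly) // VPMSUMD with const2
	//	crc = (V0 ^ t) >> 32               // VXOR, then VSLDOI/MFVSRD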

	VAND V0,mask_32bit,V1
	VPMSUMD V1,const1,V1
	VAND V1,mask_32bit,V1
	VPMSUMD V1,const2,V1
	VXOR V0,V1,V0
	VSLDOI $4,V0,zeroes,V0

#endif

	MFVSRD VS32,R3 // VS32 = V0

	NOR R3,R3,R3 // return ^crc
	MOVW R3,ret+32(FP)
	RET

first_warm_up_done:

	LVX (R3),const1
	ADD $16,R3

	VPMSUMD V16,const1,V8
	VPMSUMD V17,const1,V9
	VPMSUMD V18,const1,V10
	VPMSUMD V19,const1,V11
	VPMSUMD V20,const1,V12
	VPMSUMD V21,const1,V13
	VPMSUMD V22,const1,V14
	VPMSUMD V23,const1,V15

	BR second_cool_down

short:
	CMP $0,R5
	BEQ zero

	// compute short constants

	CMP R14,$1
	BNE castshTable
	MOVD $·IEEEConst(SB),R3
	ADD $4080,R3
	BR startshConst
castshTable:
	MOVD $·CastConst(SB),R3
	ADD $4080,R3

startshConst:
	SUBC R5,$256,R6 // sub from 256
	ADD R3,R6,R3

	// calculate where to start

	SRD $4,R5,R7
	MOVD R7,CTR

	VXOR V19,V19,V19
	VXOR V20,V20,V20

	LVX (R4),V0
	LVX (R3),V16
	VXOR V0,V8,V0 // xor in the initial crc
	VPMSUMW V0,V16,V0
	BC 18,0,v0

	LVX (R4+off16),V1
	LVX (R3+off16),V17
	VPMSUMW V1,V17,V1
	BC 18,0,v1

	LVX (R4+off32),V2
	LVX (R3+off32),V16
	VPMSUMW V2,V16,V2
	BC 18,0,v2

	LVX (R4+off48),V3
	LVX (R3+off48),V17
	VPMSUMW V3,V17,V3
	BC 18,0,v3

	LVX (R4+off64),V4
	LVX (R3+off64),V16
	VPMSUMW V4,V16,V4
	BC 18,0,v4

	LVX (R4+off80),V5
	LVX (R3+off80),V17
	VPMSUMW V5,V17,V5
	BC 18,0,v5

	LVX (R4+off96),V6
	LVX (R3+off96),V16
	VPMSUMW V6,V16,V6
	BC 18,0,v6

	LVX (R4+off112),V7
	LVX (R3+off112),V17
	VPMSUMW V7,V17,V7
	BC 18,0,v7

	ADD $128,R3
	ADD $128,R4

	LVX (R4),V8
	LVX (R3),V16
	VPMSUMW V8,V16,V8
	BC 18,0,v8

	LVX (R4+off16),V9
	LVX (R3+off16),V17
	VPMSUMW V9,V17,V9
	BC 18,0,v9

	LVX (R4+off32),V10
	LVX (R3+off32),V16
	VPMSUMW V10,V16,V10
	BC 18,0,v10

	LVX (R4+off48),V11
	LVX (R3+off48),V17
	VPMSUMW V11,V17,V11
	BC 18,0,v11

	LVX (R4+off64),V12
	LVX (R3+off64),V16
	VPMSUMW V12,V16,V12
	BC 18,0,v12

	LVX (R4+off80),V13
	LVX (R3+off80),V17
	VPMSUMW V13,V17,V13
	BC 18,0,v13

	LVX (R4+off96),V14
	LVX (R3+off96),V16
	VPMSUMW V14,V16,V14
	BC 18,0,v14

	LVX (R4+off112),V15
	LVX (R3+off112),V17
	VPMSUMW V15,V17,V15

	VXOR V19,V15,V19
v14:	VXOR V20,V14,V20
v13:	VXOR V19,V13,V19
v12:	VXOR V20,V12,V20
v11:	VXOR V19,V11,V19
v10:	VXOR V20,V10,V20
v9:	VXOR V19,V9,V19
v8:	VXOR V20,V8,V20
v7:	VXOR V19,V7,V19
v6:	VXOR V20,V6,V20
v5:	VXOR V19,V5,V19
v4:	VXOR V20,V4,V20
v3:	VXOR V19,V3,V19
v2:	VXOR V20,V2,V20
v1:	VXOR V19,V1,V19
v0:	VXOR V20,V0,V20

	VXOR V19,V20,V0 // combine the two accumulators

	BR barrett_reduction

zero:
	// Zero-length input: return the original crc unchanged
	MOVW R10,ret+32(FP)
	RET