// Code generated by command: go run gen.go -out ../decompress_amd64.s -pkg=huff0. DO NOT EDIT.

//go:build amd64 && !appengine && !noasm && gc

// func decompress4x_main_loop_amd64(ctx *decompress4xContext)
//
// Decodes two bytes from each of four bit readers per loop iteration and
// stores each pair to one of four output positions spaced R8 bytes apart.
//
// Fields read from ctx, by offset. Names in quotes come from the inline
// comments; NOTE(review): confirm them against the Go struct definition.
//    0(ctx)  pointer to the four bit-reader records (48 bytes apart)
//    8(ctx)  byte used as the right-shift count for peekTopBits ("peekBits")
//   16(ctx)  output pointer ("out")
//   24(ctx)  distance between the four output positions ("dstEvery")
//   32(ctx)  base of the table of 16-bit entries ("table")
//   48(ctx)  value the output pointer is compared against every iteration
// Field written on exit:
//   40(ctx)  (final output pointer - 16(ctx)) * 4
//
// Bit-reader record fields used (offsets relative to each record):
//    +0  base pointer of the input bytes
//   +24  "off"
//   +32  "value"
//   +40  "bitsRead" (one byte)
//
// Table entries are 16 bits wide: the low byte is used as the shift count /
// bitsRead increment, the high byte is the byte written to the output.
TEXT ·decompress4x_main_loop_amd64(SB), $0-8
	// Preload values
	MOVQ ctx+0(FP), AX
	MOVBQZX 8(AX), DI
	MOVQ 16(AX), BX
	MOVQ 48(AX), SI
	MOVQ 24(AX), R8
	MOVQ 32(AX), R9
	MOVQ (AX), R10

	// Main loop
main_loop:
	// DL is the "exhausted" flag for this iteration. It starts as
	// (BX >= SI, signed) and is incremented by each refill that leaves
	// the reader's off below 4.
	XORL DX, DX
	CMPQ BX, SI
	SETGE DL

	// br0.fillFast32()
	// The refill is skipped when bitsRead <= 32 (unsigned compare).
	MOVQ 32(R10), R11
	MOVBQZX 40(R10), R12
	CMPQ R12, $0x20
	JBE skip_fill0
	MOVQ 24(R10), AX
	SUBQ $0x20, R12
	SUBQ $0x04, AX
	MOVQ (R10), R13

	// b.value |= uint64(low) << (b.bitsRead & 63)
	// MOVL loads the 32 bits at base+off; the shift count is taken from CL.
	MOVL (AX)(R13*1), R13
	MOVQ R12, CX
	SHLQ CL, R13
	MOVQ AX, 24(R10)
	ORQ R13, R11

	// exhausted += (br0.off < 4)
	// CMPQ sets the carry flag when AX < 4 (unsigned); ADCB adds it to DL.
	CMPQ AX, $0x04
	ADCB $+0, DL

skip_fill0:
	// val0 := br0.peekTopBits(peekBits)
	MOVQ R11, R13
	MOVQ DI, CX
	SHRQ CL, R13

	// v0 := table[val0&mask]
	// The shifted value indexes the table directly; no mask instruction is emitted.
	MOVW (R9)(R13*2), CX

	// br0.advance(uint8(v0.entry))
	// CH (high byte of the entry) goes to AL; CL shifts value and is added to bitsRead.
	MOVB CH, AL
	SHLQ CL, R11
	ADDB CL, R12

	// val1 := br0.peekTopBits(peekBits)
	MOVQ DI, CX
	MOVQ R11, R13
	SHRQ CL, R13

	// v1 := table[val1&mask]
	MOVW (R9)(R13*2), CX

	// br0.advance(uint8(v1.entry))
	MOVB CH, AH
	SHLQ CL, R11
	ADDB CL, R12

	// these two writes get coalesced
	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
	MOVW AX, (BX)

	// update the bitreader structure
	MOVQ R11, 32(R10)
	MOVB R12, 40(R10)

	// br1.fillFast32()
	// Same sequence as br0, using the record at offset 48.
	MOVQ 80(R10), R11
	MOVBQZX 88(R10), R12
	CMPQ R12, $0x20
	JBE skip_fill1
	MOVQ 72(R10), AX
	SUBQ $0x20, R12
	SUBQ $0x04, AX
	MOVQ 48(R10), R13

	// b.value |= uint64(low) << (b.bitsRead & 63)
	MOVL (AX)(R13*1), R13
	MOVQ R12, CX
	SHLQ CL, R13
	MOVQ AX, 72(R10)
	ORQ R13, R11

	// exhausted += (br1.off < 4)
	CMPQ AX, $0x04
	ADCB $+0, DL

skip_fill1:
	// val0 := br1.peekTopBits(peekBits)
	MOVQ R11, R13
	MOVQ DI, CX
	SHRQ CL, R13

	// v0 := table[val0&mask]
	MOVW (R9)(R13*2), CX

	// br1.advance(uint8(v0.entry))
	MOVB CH, AL
	SHLQ CL, R11
	ADDB CL, R12

	// val1 := br1.peekTopBits(peekBits)
	MOVQ DI, CX
	MOVQ R11, R13
	SHRQ CL, R13

	// v1 := table[val1&mask]
	MOVW (R9)(R13*2), CX

	// br1.advance(uint8(v1.entry))
	MOVB CH, AH
	SHLQ CL, R11
	ADDB CL, R12

	// these two writes get coalesced
	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
	MOVW AX, (BX)(R8*1)

	// update the bitreader structure
	MOVQ R11, 80(R10)
	MOVB R12, 88(R10)

	// br2.fillFast32()
	// Same sequence as br0, using the record at offset 96.
	MOVQ 128(R10), R11
	MOVBQZX 136(R10), R12
	CMPQ R12, $0x20
	JBE skip_fill2
	MOVQ 120(R10), AX
	SUBQ $0x20, R12
	SUBQ $0x04, AX
	MOVQ 96(R10), R13

	// b.value |= uint64(low) << (b.bitsRead & 63)
	MOVL (AX)(R13*1), R13
	MOVQ R12, CX
	SHLQ CL, R13
	MOVQ AX, 120(R10)
	ORQ R13, R11

	// exhausted += (br2.off < 4)
	CMPQ AX, $0x04
	ADCB $+0, DL

skip_fill2:
	// val0 := br2.peekTopBits(peekBits)
	MOVQ R11, R13
	MOVQ DI, CX
	SHRQ CL, R13

	// v0 := table[val0&mask]
	MOVW (R9)(R13*2), CX

	// br2.advance(uint8(v0.entry))
	MOVB CH, AL
	SHLQ CL, R11
	ADDB CL, R12

	// val1 := br2.peekTopBits(peekBits)
	MOVQ DI, CX
	MOVQ R11, R13
	SHRQ CL, R13

	// v1 := table[val1&mask]
	MOVW (R9)(R13*2), CX

	// br2.advance(uint8(v1.entry))
	MOVB CH, AH
	SHLQ CL, R11
	ADDB CL, R12

	// these two writes get coalesced
	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
	MOVW AX, (BX)(R8*2)

	// update the bitreader structure
	MOVQ R11, 128(R10)
	MOVB R12, 136(R10)

	// br3.fillFast32()
	// Same sequence as br0, using the record at offset 144.
	MOVQ 176(R10), R11
	MOVBQZX 184(R10), R12
	CMPQ R12, $0x20
	JBE skip_fill3
	MOVQ 168(R10), AX
	SUBQ $0x20, R12
	SUBQ $0x04, AX
	MOVQ 144(R10), R13

	// b.value |= uint64(low) << (b.bitsRead & 63)
	MOVL (AX)(R13*1), R13
	MOVQ R12, CX
	SHLQ CL, R13
	MOVQ AX, 168(R10)
	ORQ R13, R11

	// exhausted += (br3.off < 4)
	CMPQ AX, $0x04
	ADCB $+0, DL

skip_fill3:
	// val0 := br3.peekTopBits(peekBits)
	MOVQ R11, R13
	MOVQ DI, CX
	SHRQ CL, R13

	// v0 := table[val0&mask]
	MOVW (R9)(R13*2), CX

	// br3.advance(uint8(v0.entry))
	MOVB CH, AL
	SHLQ CL, R11
	ADDB CL, R12

	// val1 := br3.peekTopBits(peekBits)
	MOVQ DI, CX
	MOVQ R11, R13
	SHRQ CL, R13

	// v1 := table[val1&mask]
	MOVW (R9)(R13*2), CX

	// br3.advance(uint8(v1.entry))
	MOVB CH, AH
	SHLQ CL, R11
	ADDB CL, R12

	// these two writes get coalesced
	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
	// CX = 3 * R8, formed with LEAQ because the addressing mode has no *3 scale.
	LEAQ (R8)(R8*2), CX
	MOVW AX, (BX)(CX*1)

	// update the bitreader structure
	MOVQ R11, 176(R10)
	MOVB R12, 184(R10)

	// Advance the output pointer by the two bytes written per reader and
	// repeat while the exhausted flag is still zero.
	ADDQ $0x02, BX
	TESTB DL, DL
	JZ main_loop

	// Store (BX - 16(ctx)) * 4 into 40(ctx).
	MOVQ ctx+0(FP), AX
	SUBQ 16(AX), BX
	SHLQ $0x02, BX
	MOVQ BX, 40(AX)
	RET

// func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
//
// Variant of the 4x main loop that decodes four bytes from each of the four
// bit readers per iteration and stores each group with one 32-bit write.
//
// The ctx offsets (0, 8, 16, 24, 32, 48 read; 40 written) and the bit-reader
// record offsets (+0, +24, +32, +40; records 48 bytes apart) are used the same
// way as in the loop bodies below show: 8(ctx) is the peek shift count,
// 16(ctx) the output pointer, 24(ctx) the output spacing, 32(ctx) the table
// base and 48(ctx) the value the output pointer is compared against.
// NOTE(review): confirm field names against the Go struct definition.
//
// Byte packing: v0 -> AL, v1 -> AH, BSWAPL, v2 -> AH, v3 -> AL, BSWAPL. After
// the second swap the low four bytes of AX hold v0, v1, v2, v3 in memory order.
TEXT ·decompress4x_8b_main_loop_amd64(SB), $0-8
	// Preload values
	MOVQ ctx+0(FP), CX
	MOVBQZX 8(CX), DI
	MOVQ 16(CX), BX
	MOVQ 48(CX), SI
	MOVQ 24(CX), R8
	MOVQ 32(CX), R9
	MOVQ (CX), R10

	// Main loop
main_loop:
	// DL is the "exhausted" flag: (BX >= SI, signed) plus one for every
	// refill that leaves a reader's off below 4.
	XORL DX, DX
	CMPQ BX, SI
	SETGE DL

	// br0.fillFast32()
	// The refill is skipped when bitsRead <= 32 (unsigned compare).
	MOVQ 32(R10), R11
	MOVBQZX 40(R10), R12
	CMPQ R12, $0x20
	JBE skip_fill0
	MOVQ 24(R10), R13
	SUBQ $0x20, R12
	SUBQ $0x04, R13
	MOVQ (R10), R14

	// b.value |= uint64(low) << (b.bitsRead & 63)
	MOVL (R13)(R14*1), R14
	MOVQ R12, CX
	SHLQ CL, R14
	MOVQ R13, 24(R10)
	ORQ R14, R11

	// exhausted += (br0.off < 4)
	// CMPQ sets the carry flag when R13 < 4 (unsigned); ADCB adds it to DL.
	CMPQ R13, $0x04
	ADCB $+0, DL

skip_fill0:
	// val0 := br0.peekTopBits(peekBits)
	MOVQ R11, R13
	MOVQ DI, CX
	SHRQ CL, R13

	// v0 := table[val0&mask]
	MOVW (R9)(R13*2), CX

	// br0.advance(uint8(v0.entry))
	MOVB CH, AL
	SHLQ CL, R11
	ADDB CL, R12

	// val1 := br0.peekTopBits(peekBits)
	MOVQ R11, R13
	MOVQ DI, CX
	SHRQ CL, R13

	// v1 := table[val1&mask]
	MOVW (R9)(R13*2), CX

	// br0.advance(uint8(v1.entry))
	MOVB CH, AH
	SHLQ CL, R11
	ADDB CL, R12
	BSWAPL AX

	// val2 := br0.peekTopBits(peekBits)
	MOVQ R11, R13
	MOVQ DI, CX
	SHRQ CL, R13

	// v2 := table[val2&mask]
	MOVW (R9)(R13*2), CX

	// br0.advance(uint8(v2.entry))
	MOVB CH, AH
	SHLQ CL, R11
	ADDB CL, R12

	// val3 := br0.peekTopBits(peekBits)
	MOVQ R11, R13
	MOVQ DI, CX
	SHRQ CL, R13

	// v3 := table[val3&mask]
	MOVW (R9)(R13*2), CX

	// br0.advance(uint8(v3.entry))
	MOVB CH, AL
	SHLQ CL, R11
	ADDB CL, R12
	BSWAPL AX

	// these four writes get coalesced
	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
	MOVL AX, (BX)

	// update the bitreader structure
	MOVQ R11, 32(R10)
	MOVB R12, 40(R10)

	// br1.fillFast32()
	// Same sequence as br0, using the record at offset 48.
	MOVQ 80(R10), R11
	MOVBQZX 88(R10), R12
	CMPQ R12, $0x20
	JBE skip_fill1
	MOVQ 72(R10), R13
	SUBQ $0x20, R12
	SUBQ $0x04, R13
	MOVQ 48(R10), R14

	// b.value |= uint64(low) << (b.bitsRead & 63)
	MOVL (R13)(R14*1), R14
	MOVQ R12, CX
	SHLQ CL, R14
	MOVQ R13, 72(R10)
	ORQ R14, R11

	// exhausted += (br1.off < 4)
	CMPQ R13, $0x04
	ADCB $+0, DL

skip_fill1:
	// val0 := br1.peekTopBits(peekBits)
	MOVQ R11, R13
	MOVQ DI, CX
	SHRQ CL, R13

	// v0 := table[val0&mask]
	MOVW (R9)(R13*2), CX

	// br1.advance(uint8(v0.entry))
	MOVB CH, AL
	SHLQ CL, R11
	ADDB CL, R12

	// val1 := br1.peekTopBits(peekBits)
	MOVQ R11, R13
	MOVQ DI, CX
	SHRQ CL, R13

	// v1 := table[val1&mask]
	MOVW (R9)(R13*2), CX

	// br1.advance(uint8(v1.entry))
	MOVB CH, AH
	SHLQ CL, R11
	ADDB CL, R12
	BSWAPL AX

	// val2 := br1.peekTopBits(peekBits)
	MOVQ R11, R13
	MOVQ DI, CX
	SHRQ CL, R13

	// v2 := table[val2&mask]
	MOVW (R9)(R13*2), CX

	// br1.advance(uint8(v2.entry))
	MOVB CH, AH
	SHLQ CL, R11
	ADDB CL, R12

	// val3 := br1.peekTopBits(peekBits)
	MOVQ R11, R13
	MOVQ DI, CX
	SHRQ CL, R13

	// v3 := table[val3&mask]
	MOVW (R9)(R13*2), CX

	// br1.advance(uint8(v3.entry))
	MOVB CH, AL
	SHLQ CL, R11
	ADDB CL, R12
	BSWAPL AX

	// these four writes get coalesced
	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
	MOVL AX, (BX)(R8*1)

	// update the bitreader structure
	MOVQ R11, 80(R10)
	MOVB R12, 88(R10)

	// br2.fillFast32()
	// Same sequence as br0, using the record at offset 96.
	MOVQ 128(R10), R11
	MOVBQZX 136(R10), R12
	CMPQ R12, $0x20
	JBE skip_fill2
	MOVQ 120(R10), R13
	SUBQ $0x20, R12
	SUBQ $0x04, R13
	MOVQ 96(R10), R14

	// b.value |= uint64(low) << (b.bitsRead & 63)
	MOVL (R13)(R14*1), R14
	MOVQ R12, CX
	SHLQ CL, R14
	MOVQ R13, 120(R10)
	ORQ R14, R11

	// exhausted += (br2.off < 4)
	CMPQ R13, $0x04
	ADCB $+0, DL

skip_fill2:
	// val0 := br2.peekTopBits(peekBits)
	MOVQ R11, R13
	MOVQ DI, CX
	SHRQ CL, R13

	// v0 := table[val0&mask]
	MOVW (R9)(R13*2), CX

	// br2.advance(uint8(v0.entry))
	MOVB CH, AL
	SHLQ CL, R11
	ADDB CL, R12

	// val1 := br2.peekTopBits(peekBits)
	MOVQ R11, R13
	MOVQ DI, CX
	SHRQ CL, R13

	// v1 := table[val1&mask]
	MOVW (R9)(R13*2), CX

	// br2.advance(uint8(v1.entry))
	MOVB CH, AH
	SHLQ CL, R11
	ADDB CL, R12
	BSWAPL AX

	// val2 := br2.peekTopBits(peekBits)
	MOVQ R11, R13
	MOVQ DI, CX
	SHRQ CL, R13

	// v2 := table[val2&mask]
	MOVW (R9)(R13*2), CX

	// br2.advance(uint8(v2.entry))
	MOVB CH, AH
	SHLQ CL, R11
	ADDB CL, R12

	// val3 := br2.peekTopBits(peekBits)
	MOVQ R11, R13
	MOVQ DI, CX
	SHRQ CL, R13

	// v3 := table[val3&mask]
	MOVW (R9)(R13*2), CX

	// br2.advance(uint8(v3.entry))
	MOVB CH, AL
	SHLQ CL, R11
	ADDB CL, R12
	BSWAPL AX

	// these four writes get coalesced
	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
	MOVL AX, (BX)(R8*2)

	// update the bitreader structure
	MOVQ R11, 128(R10)
	MOVB R12, 136(R10)

	// br3.fillFast32()
	// Same sequence as br0, using the record at offset 144.
	MOVQ 176(R10), R11
	MOVBQZX 184(R10), R12
	CMPQ R12, $0x20
	JBE skip_fill3
	MOVQ 168(R10), R13
	SUBQ $0x20, R12
	SUBQ $0x04, R13
	MOVQ 144(R10), R14

	// b.value |= uint64(low) << (b.bitsRead & 63)
	MOVL (R13)(R14*1), R14
	MOVQ R12, CX
	SHLQ CL, R14
	MOVQ R13, 168(R10)
	ORQ R14, R11

	// exhausted += (br3.off < 4)
	CMPQ R13, $0x04
	ADCB $+0, DL

skip_fill3:
	// val0 := br3.peekTopBits(peekBits)
	MOVQ R11, R13
	MOVQ DI, CX
	SHRQ CL, R13

	// v0 := table[val0&mask]
	MOVW (R9)(R13*2), CX

	// br3.advance(uint8(v0.entry))
	MOVB CH, AL
	SHLQ CL, R11
	ADDB CL, R12

	// val1 := br3.peekTopBits(peekBits)
	MOVQ R11, R13
	MOVQ DI, CX
	SHRQ CL, R13

	// v1 := table[val1&mask]
	MOVW (R9)(R13*2), CX

	// br3.advance(uint8(v1.entry))
	MOVB CH, AH
	SHLQ CL, R11
	ADDB CL, R12
	BSWAPL AX

	// val2 := br3.peekTopBits(peekBits)
	MOVQ R11, R13
	MOVQ DI, CX
	SHRQ CL, R13

	// v2 := table[val2&mask]
	MOVW (R9)(R13*2), CX

	// br3.advance(uint8(v2.entry))
	MOVB CH, AH
	SHLQ CL, R11
	ADDB CL, R12

	// val3 := br3.peekTopBits(peekBits)
	MOVQ R11, R13
	MOVQ DI, CX
	SHRQ CL, R13

	// v3 := table[val3&mask]
	MOVW (R9)(R13*2), CX

	// br3.advance(uint8(v3.entry))
	MOVB CH, AL
	SHLQ CL, R11
	ADDB CL, R12
	BSWAPL AX

	// these four writes get coalesced
	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
	// CX = 3 * R8, formed with LEAQ because the addressing mode has no *3 scale.
	LEAQ (R8)(R8*2), CX
	MOVL AX, (BX)(CX*1)

	// update the bitreader structure
	MOVQ R11, 176(R10)
	MOVB R12, 184(R10)

	// Advance the output pointer by the four bytes written per reader and
	// repeat while the exhausted flag is still zero.
	ADDQ $0x04, BX
	TESTB DL, DL
	JZ main_loop

	// Store (BX - 16(ctx)) * 4 into 40(ctx).
	MOVQ ctx+0(FP), AX
	SUBQ 16(AX), BX
	SHLQ $0x02, BX
	MOVQ BX, 40(AX)
	RET

// func decompress1x_main_loop_amd64(ctx *decompress1xContext)
//
// Single-stream decode loop: each iteration decodes four bytes from one bit
// reader and stores them with one 32-bit write.
//
// Fields read from ctx, by offset (NOTE(review): confirm names against the Go
// struct definition):
//    0(ctx)  pointer to the bit-reader record
//    8(ctx)  byte used as the right-shift count when peeking
//   16(ctx)  output start pointer
//   24(ctx)  output length; added to 16(ctx) to form the end pointer
//   32(ctx)  base of the table of 16-bit entries
// Field written:
//   40(ctx)  number of bytes written (DX - 16(ctx)) on normal exit, or -1 on
//            the error path
//
// Bit-reader record fields: +0 input base pointer, +24 offset, +32 bit
// container, +40 bits-consumed byte. They are kept in R8, R9, R10 and R11 for
// the whole loop and written back only on the normal exit path.
TEXT ·decompress1x_main_loop_amd64(SB), $0-8
	MOVQ ctx+0(FP), CX
	MOVQ 16(CX), DX
	MOVQ 24(CX), BX

	// Fewer than 4 bytes of output space (unsigned compare) is reported as an error.
	CMPQ BX, $0x04
	JB error_max_decoded_size_exceeded

	// BX = end of the output buffer.
	LEAQ (DX)(BX*1), BX
	MOVQ (CX), SI
	MOVQ (SI), R8
	MOVQ 24(SI), R9
	MOVQ 32(SI), R10
	MOVBQZX 40(SI), R11

	// SI is reused: from here on it holds the table base.
	MOVQ 32(CX), SI
	MOVBQZX 8(CX), DI
	JMP loop_condition

main_loop:
	// Check if we have room for 4 bytes in the output buffer
	// NOTE(review): the branch is taken when DX+4 >= BX (signed), so exactly
	// four remaining bytes is also routed to the error path.
	LEAQ 4(DX), CX
	CMPQ CX, BX
	JGE error_max_decoded_size_exceeded

	// Decode 4 values
	// Refill: skipped when R11 < 32 (signed). Otherwise step the offset back
	// by 4, load 32 bits from base+offset and OR them into the container
	// shifted left by the updated R11.
	CMPQ R11, $0x20
	JL bitReader_fillFast_1_end
	SUBQ $0x20, R11
	SUBQ $0x04, R9
	MOVL (R8)(R9*1), R12
	MOVQ R11, CX
	SHLQ CL, R12
	ORQ R12, R10

bitReader_fillFast_1_end:
	// Value 0: index = container >> DI; entry high byte -> AL, low byte is
	// added to R11 and used to shift the container left.
	MOVQ DI, CX
	MOVQ R10, R12
	SHRQ CL, R12
	MOVW (SI)(R12*2), CX
	MOVB CH, AL
	MOVBQZX CL, CX
	ADDQ CX, R11
	SHLQ CL, R10

	// Value 1: same steps, output byte -> AH.
	MOVQ DI, CX
	MOVQ R10, R12
	SHRQ CL, R12
	MOVW (SI)(R12*2), CX
	MOVB CH, AH
	MOVBQZX CL, CX
	ADDQ CX, R11
	SHLQ CL, R10

	// Move the first two bytes to the top of EAX so AL/AH can be reused.
	BSWAPL AX

	// Second refill, identical to the first.
	CMPQ R11, $0x20
	JL bitReader_fillFast_2_end
	SUBQ $0x20, R11
	SUBQ $0x04, R9
	MOVL (R8)(R9*1), R12
	MOVQ R11, CX
	SHLQ CL, R12
	ORQ R12, R10

bitReader_fillFast_2_end:
	// Value 2: output byte -> AH.
	MOVQ DI, CX
	MOVQ R10, R12
	SHRQ CL, R12
	MOVW (SI)(R12*2), CX
	MOVB CH, AH
	MOVBQZX CL, CX
	ADDQ CX, R11
	SHLQ CL, R10

	// Value 3: output byte -> AL.
	MOVQ DI, CX
	MOVQ R10, R12
	SHRQ CL, R12
	MOVW (SI)(R12*2), CX
	MOVB CH, AL
	MOVBQZX CL, CX
	ADDQ CX, R11
	SHLQ CL, R10

	// After this swap EAX holds values 0..3 in memory order.
	BSWAPL AX

	// Store the decoded values
	MOVL AX, (DX)
	ADDQ $0x04, DX

loop_condition:
	// Keep looping while the reader offset is at least 8 (signed compare).
	CMPQ R9, $0x08
	JGE main_loop

	// Update ctx structure
	MOVQ ctx+0(FP), AX
	SUBQ 16(AX), DX
	MOVQ DX, 40(AX)

	// Write the cached bit-reader state back to its record.
	MOVQ (AX), AX
	MOVQ R9, 24(AX)
	MOVQ R10, 32(AX)
	MOVB R11, 40(AX)
	RET

	// Report error
	// Stores -1 in 40(ctx); the bit-reader record is not written back here.
error_max_decoded_size_exceeded:
	MOVQ ctx+0(FP), AX
	MOVQ $-1, CX
	MOVQ CX, 40(AX)
	RET

// func decompress1x_main_loop_bmi2(ctx *decompress1xContext)
// Requires: BMI2
//
// Same loop as decompress1x_main_loop_amd64, with the CL-based SHLQ/SHRQ
// shifts replaced by SHLXQ/SHRXQ, which take the shift count from any
// register. The ctx offsets (0, 8, 16, 24, 32 read; 40 written) and the
// bit-reader record offsets (+0, +24, +32, +40) are used identically.
TEXT ·decompress1x_main_loop_bmi2(SB), $0-8
	MOVQ ctx+0(FP), CX
	MOVQ 16(CX), DX
	MOVQ 24(CX), BX

	// Fewer than 4 bytes of output space (unsigned compare) is reported as an error.
	CMPQ BX, $0x04
	JB error_max_decoded_size_exceeded

	// BX = end of the output buffer.
	LEAQ (DX)(BX*1), BX
	MOVQ (CX), SI
	MOVQ (SI), R8
	MOVQ 24(SI), R9
	MOVQ 32(SI), R10
	MOVBQZX 40(SI), R11

	// SI is reused: from here on it holds the table base.
	MOVQ 32(CX), SI
	MOVBQZX 8(CX), DI
	JMP loop_condition

main_loop:
	// Check if we have room for 4 bytes in the output buffer
	// NOTE(review): the branch is taken when DX+4 >= BX (signed), so exactly
	// four remaining bytes is also routed to the error path.
	LEAQ 4(DX), CX
	CMPQ CX, BX
	JGE error_max_decoded_size_exceeded

	// Decode 4 values
	// Refill: skipped when R11 < 32 (signed). Otherwise step the offset back
	// by 4, load 32 bits from base+offset and OR them into the container
	// shifted left by the updated R11.
	CMPQ R11, $0x20
	JL bitReader_fillFast_1_end
	SUBQ $0x20, R11
	SUBQ $0x04, R9
	MOVL (R8)(R9*1), CX
	SHLXQ R11, CX, CX
	ORQ CX, R10

bitReader_fillFast_1_end:
	// Value 0: index = container >> DI; entry high byte -> AL, low byte is
	// added to R11 and used to shift the container left.
	SHRXQ DI, R10, CX
	MOVW (SI)(CX*2), CX
	MOVB CH, AL
	MOVBQZX CL, CX
	ADDQ CX, R11
	SHLXQ CX, R10, R10

	// Value 1: same steps, output byte -> AH.
	SHRXQ DI, R10, CX
	MOVW (SI)(CX*2), CX
	MOVB CH, AH
	MOVBQZX CL, CX
	ADDQ CX, R11
	SHLXQ CX, R10, R10

	// Move the first two bytes to the top of EAX so AL/AH can be reused.
	BSWAPL AX

	// Second refill, identical to the first.
	CMPQ R11, $0x20
	JL bitReader_fillFast_2_end
	SUBQ $0x20, R11
	SUBQ $0x04, R9
	MOVL (R8)(R9*1), CX
	SHLXQ R11, CX, CX
	ORQ CX, R10

bitReader_fillFast_2_end:
	// Value 2: output byte -> AH.
	SHRXQ DI, R10, CX
	MOVW (SI)(CX*2), CX
	MOVB CH, AH
	MOVBQZX CL, CX
	ADDQ CX, R11
	SHLXQ CX, R10, R10

	// Value 3: output byte -> AL.
	SHRXQ DI, R10, CX
	MOVW (SI)(CX*2), CX
	MOVB CH, AL
	MOVBQZX CL, CX
	ADDQ CX, R11
	SHLXQ CX, R10, R10

	// After this swap EAX holds values 0..3 in memory order.
	BSWAPL AX

	// Store the decoded values
	MOVL AX, (DX)
	ADDQ $0x04, DX

loop_condition:
	// Keep looping while the reader offset is at least 8 (signed compare).
	CMPQ R9, $0x08
	JGE main_loop

	// Update ctx structure
	MOVQ ctx+0(FP), AX
	SUBQ 16(AX), DX
	MOVQ DX, 40(AX)

	// Write the cached bit-reader state back to its record.
	MOVQ (AX), AX
	MOVQ R9, 24(AX)
	MOVQ R10, 32(AX)
	MOVB R11, 40(AX)
	RET

	// Report error
	// Stores -1 in 40(ctx); the bit-reader record is not written back here.
error_max_decoded_size_exceeded:
	MOVQ ctx+0(FP), AX
	MOVQ $-1, CX
	MOVQ CX, 40(AX)
	RET