// Source listing: github.com/bir3/gocompiler@v0.9.2202/extra/compress/zstd/seqdec_amd64.s
// Code generated by command: go run gen.go -out ../seqdec_amd64.s -pkg=zstd. DO NOT EDIT.

//go:build !appengine && !noasm && gc && !noasm

// func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: CMOV
//
// Decodes sequence records in a loop, writing one 24-byte record per iteration
// at the cursor loaded from 104(ctx): literal length at +0, match length at +8,
// match offset at +16. The loop repeats until the counter at 96(ctx), which is
// decremented once per record, goes negative.
//
// Register use:
//   DX         64-bit bit buffer, loaded from 32(br)
//   BX         number of bits already consumed from DX, loaded from the byte at 40(br)
//   SI         input bytes still available below the read pointer, loaded from 24(br)
//   (SP), R14  current input pointer; starts at (br)+SI and moves downward
//   DI/R8/R9   literal-length / match-length / offset decoder state, from 72/80/88(ctx)
//   R10        output record cursor
//   R11-R13    three most recent offsets, from 144/152/160(s)
//
// State word layout, as consumed below: bits 0-7 = bit count for the next-state
// read, bits 8-15 = bit count of the extra value bits, bits 16-31 = next-state
// base index, bits 32-63 = baseline that the extra bits are added to.
//
// Return codes: 0 success, 1 non-zero match length with a zero offset,
// 2 match length above 0x20002, 4 literal budget at 128(ctx) went negative,
// 6 bit reader over-read. Only the success path stores the register-held
// reader state and offset history back into br and s.
TEXT ·sequenceDecs_decode_amd64(SB), $8-32
	MOVQ br+8(FP), AX
	MOVQ 32(AX), DX
	MOVBQZX 40(AX), BX
	MOVQ 24(AX), SI
	MOVQ (AX), AX
	ADDQ SI, AX
	MOVQ AX, (SP)
	MOVQ ctx+16(FP), AX
	MOVQ 72(AX), DI
	MOVQ 80(AX), R8
	MOVQ 88(AX), R9
	MOVQ 104(AX), R10
	MOVQ s+0(FP), AX
	MOVQ 144(AX), R11
	MOVQ 152(AX), R12
	MOVQ 160(AX), R13

sequenceDecs_decode_amd64_main_loop:
	MOVQ (SP), R14

	// Fill bitreader to have enough for the offset and match length.
	// Fast path (at least 8 input bytes left): step the pointer back by BX/8
	// whole bytes, reload a full 8-byte word and keep only the sub-byte count.
	CMPQ SI, $0x08
	JL sequenceDecs_decode_amd64_fill_byte_by_byte
	MOVQ BX, AX
	SHRQ $0x03, AX
	SUBQ AX, R14
	MOVQ (R14), DX
	SUBQ AX, SI
	ANDQ $0x07, BX
	JMP sequenceDecs_decode_amd64_fill_end

	// Slow path: shift in one byte at a time while more than 7 bits are consumed.
sequenceDecs_decode_amd64_fill_byte_by_byte:
	CMPQ SI, $0x00
	JLE sequenceDecs_decode_amd64_fill_check_overread
	CMPQ BX, $0x07
	JLE sequenceDecs_decode_amd64_fill_end
	SHLQ $0x08, DX
	SUBQ $0x01, R14
	SUBQ $0x01, SI
	SUBQ $0x08, BX
	MOVBQZX (R14), AX
	ORQ AX, DX
	JMP sequenceDecs_decode_amd64_fill_byte_by_byte

	// Input exhausted: more than 64 consumed bits means we read past the data.
sequenceDecs_decode_amd64_fill_check_overread:
	CMPQ BX, $0x40
	JA error_overread

sequenceDecs_decode_amd64_fill_end:
	// Update offset
	// R15 = DX with the consumed bits shifted out; CL = extra-bit count (AH);
	// AX = baseline. When the count is zero, or would push BX past 64, only the
	// baseline is stored. NEGQ+SHRQ shifts right by 64-count (CL is taken mod 64).
	MOVQ R9, AX
	MOVQ BX, CX
	MOVQ DX, R15
	SHLQ CL, R15
	MOVB AH, CL
	SHRQ $0x20, AX
	TESTQ CX, CX
	JZ sequenceDecs_decode_amd64_of_update_zero
	ADDQ CX, BX
	CMPQ BX, $0x40
	JA sequenceDecs_decode_amd64_of_update_zero
	CMPQ CX, $0x40
	JAE sequenceDecs_decode_amd64_of_update_zero
	NEGQ CX
	SHRQ CL, R15
	ADDQ R15, AX

sequenceDecs_decode_amd64_of_update_zero:
	MOVQ AX, 16(R10)

	// Update match length
	MOVQ R8, AX
	MOVQ BX, CX
	MOVQ DX, R15
	SHLQ CL, R15
	MOVB AH, CL
	SHRQ $0x20, AX
	TESTQ CX, CX
	JZ sequenceDecs_decode_amd64_ml_update_zero
	ADDQ CX, BX
	CMPQ BX, $0x40
	JA sequenceDecs_decode_amd64_ml_update_zero
	CMPQ CX, $0x40
	JAE sequenceDecs_decode_amd64_ml_update_zero
	NEGQ CX
	SHRQ CL, R15
	ADDQ R15, AX

sequenceDecs_decode_amd64_ml_update_zero:
	MOVQ AX, 8(R10)

	// Fill bitreader to have enough for the remaining
	CMPQ SI, $0x08
	JL sequenceDecs_decode_amd64_fill_2_byte_by_byte
	MOVQ BX, AX
	SHRQ $0x03, AX
	SUBQ AX, R14
	MOVQ (R14), DX
	SUBQ AX, SI
	ANDQ $0x07, BX
	JMP sequenceDecs_decode_amd64_fill_2_end

sequenceDecs_decode_amd64_fill_2_byte_by_byte:
	CMPQ SI, $0x00
	JLE sequenceDecs_decode_amd64_fill_2_check_overread
	CMPQ BX, $0x07
	JLE sequenceDecs_decode_amd64_fill_2_end
	SHLQ $0x08, DX
	SUBQ $0x01, R14
	SUBQ $0x01, SI
	SUBQ $0x08, BX
	MOVBQZX (R14), AX
	ORQ AX, DX
	JMP sequenceDecs_decode_amd64_fill_2_byte_by_byte

sequenceDecs_decode_amd64_fill_2_check_overread:
	CMPQ BX, $0x40
	JA error_overread

sequenceDecs_decode_amd64_fill_2_end:
	// Update literal length
	MOVQ DI, AX
	MOVQ BX, CX
	MOVQ DX, R15
	SHLQ CL, R15
	MOVB AH, CL
	SHRQ $0x20, AX
	TESTQ CX, CX
	JZ sequenceDecs_decode_amd64_ll_update_zero
	ADDQ CX, BX
	CMPQ BX, $0x40
	JA sequenceDecs_decode_amd64_ll_update_zero
	CMPQ CX, $0x40
	JAE sequenceDecs_decode_amd64_ll_update_zero
	NEGQ CX
	SHRQ CL, R15
	ADDQ R15, AX

sequenceDecs_decode_amd64_ll_update_zero:
	MOVQ AX, (R10)

	// Fill bitreader for state updates
	// Saves the input pointer for the next iteration and keeps the offset
	// state's extra-bit count (bits 8-15 of R9) in AX for "Adjust offset".
	// The state update is skipped when the counter at 96(ctx) is zero, i.e.
	// on the final record.
	MOVQ R14, (SP)
	MOVQ R9, AX
	SHRQ $0x08, AX
	MOVBQZX AL, AX
	MOVQ ctx+16(FP), CX
	CMPQ 96(CX), $0x00
	JZ sequenceDecs_decode_amd64_skip_update

	// Update Literal Length State
	// R14 = bit count n (low byte); DI = next-state base (bits 16-31).
	// Rotating DX left by BX+n brings the next n unread bits to the bottom;
	// they are masked with (1<<n)-1 and added to the base to index the table.
	MOVBQZX DI, R14
	SHRQ $0x10, DI
	MOVWQZX DI, DI
	LEAQ (BX)(R14*1), CX
	MOVQ DX, R15
	MOVQ CX, BX
	ROLQ CL, R15
	MOVL $0x00000001, BP
	MOVB R14, CL
	SHLL CL, BP
	DECL BP
	ANDQ BP, R15
	ADDQ R15, DI

	// Load ctx.llTable
	MOVQ ctx+16(FP), CX
	MOVQ (CX), CX
	MOVQ (CX)(DI*8), DI

	// Update Match Length State
	MOVBQZX R8, R14
	SHRQ $0x10, R8
	MOVWQZX R8, R8
	LEAQ (BX)(R14*1), CX
	MOVQ DX, R15
	MOVQ CX, BX
	ROLQ CL, R15
	MOVL $0x00000001, BP
	MOVB R14, CL
	SHLL CL, BP
	DECL BP
	ANDQ BP, R15
	ADDQ R15, R8

	// Load ctx.mlTable
	MOVQ ctx+16(FP), CX
	MOVQ 24(CX), CX
	MOVQ (CX)(R8*8), R8

	// Update Offset State
	MOVBQZX R9, R14
	SHRQ $0x10, R9
	MOVWQZX R9, R9
	LEAQ (BX)(R14*1), CX
	MOVQ DX, R15
	MOVQ CX, BX
	ROLQ CL, R15
	MOVL $0x00000001, BP
	MOVB R14, CL
	SHLL CL, BP
	DECL BP
	ANDQ BP, R15
	ADDQ R15, R9

	// Load ctx.ofTable
	MOVQ ctx+16(FP), CX
	MOVQ 48(CX), CX
	MOVQ (CX)(R9*8), R9

sequenceDecs_decode_amd64_skip_update:
	// Adjust offset
	// AX = extra-bit count of the offset code, CX = decoded offset value.
	// With more than one extra bit the value is a new offset and is pushed
	// onto the history R11/R12/R13. Otherwise the value selects an entry of
	// the history (shifted by one when the literal length is zero).
	MOVQ 16(R10), CX
	CMPQ AX, $0x01
	JBE sequenceDecs_decode_amd64_adjust_offsetB_1_or_0
	MOVQ R12, R13
	MOVQ R11, R12
	MOVQ CX, R11
	JMP sequenceDecs_decode_amd64_after_adjust

sequenceDecs_decode_amd64_adjust_offsetB_1_or_0:
	CMPQ (R10), $0x00000000
	JNE sequenceDecs_decode_amd64_adjust_offset_maybezero
	INCQ CX
	JMP sequenceDecs_decode_amd64_adjust_offset_nonzero

sequenceDecs_decode_amd64_adjust_offset_maybezero:
	// Selector 0 with literals present: reuse the newest offset, history unchanged.
	TESTQ CX, CX
	JNZ sequenceDecs_decode_amd64_adjust_offset_nonzero
	MOVQ R11, CX
	JMP sequenceDecs_decode_amd64_after_adjust

sequenceDecs_decode_amd64_adjust_offset_nonzero:
	CMPQ CX, $0x01
	JB sequenceDecs_decode_amd64_adjust_zero
	JEQ sequenceDecs_decode_amd64_adjust_one
	CMPQ CX, $0x02
	JA sequenceDecs_decode_amd64_adjust_three
	JMP sequenceDecs_decode_amd64_adjust_two

sequenceDecs_decode_amd64_adjust_zero:
	MOVQ R11, AX
	JMP sequenceDecs_decode_amd64_adjust_test_temp_valid

sequenceDecs_decode_amd64_adjust_one:
	MOVQ R12, AX
	JMP sequenceDecs_decode_amd64_adjust_test_temp_valid

sequenceDecs_decode_amd64_adjust_two:
	MOVQ R13, AX
	JMP sequenceDecs_decode_amd64_adjust_test_temp_valid

sequenceDecs_decode_amd64_adjust_three:
	LEAQ -1(R11), AX

sequenceDecs_decode_amd64_adjust_test_temp_valid:
	// A selected offset of zero is replaced by 1.
	TESTQ AX, AX
	JNZ sequenceDecs_decode_amd64_adjust_temp_valid
	MOVQ $0x00000001, AX

sequenceDecs_decode_amd64_adjust_temp_valid:
	// The oldest entry is overwritten only when the selector is not 1.
	CMPQ CX, $0x01
	CMOVQNE R12, R13
	MOVQ R11, R12
	MOVQ AX, R11
	MOVQ AX, CX

sequenceDecs_decode_amd64_after_adjust:
	MOVQ CX, 16(R10)

	// Check values
	// AX = match length, R14 = literal length. Their sum is added to 256(s)
	// and the literal length is subtracted from the budget at 128(ctx).
	MOVQ 8(R10), AX
	MOVQ (R10), R14
	LEAQ (AX)(R14*1), R15
	MOVQ s+0(FP), BP
	ADDQ R15, 256(BP)
	MOVQ ctx+16(FP), R15
	SUBQ R14, 128(R15)
	JS error_not_enough_literals
	CMPQ AX, $0x00020002
	JA sequenceDecs_decode_amd64_error_match_len_too_big
	TESTQ CX, CX
	JNZ sequenceDecs_decode_amd64_match_len_ofs_ok
	TESTQ AX, AX
	JNZ sequenceDecs_decode_amd64_error_match_len_ofs_mismatch

sequenceDecs_decode_amd64_match_len_ofs_ok:
	ADDQ $0x18, R10
	MOVQ ctx+16(FP), AX
	DECQ 96(AX)
	JNS sequenceDecs_decode_amd64_main_loop
	MOVQ s+0(FP), AX
	MOVQ R11, 144(AX)
	MOVQ R12, 152(AX)
	MOVQ R13, 160(AX)
	MOVQ br+8(FP), AX
	MOVQ DX, 32(AX)
	MOVB BL, 40(AX)
	MOVQ SI, 24(AX)

	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error
sequenceDecs_decode_amd64_error_match_len_ofs_mismatch:
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
sequenceDecs_decode_amd64_error_match_len_too_big:
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
	// NOTE: this stub has no label and follows a RET, so nothing in this
	// function can reach it.
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
error_not_enough_literals:
	MOVQ $0x00000004, ret+24(FP)
	RET

	// Return with overread error
error_overread:
	MOVQ $0x00000006, ret+24(FP)
	RET

// func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: CMOV
//
// Same algorithm, register use and return codes as sequenceDecs_decode_amd64.
// The only difference is that the bit buffer is refilled once per record:
// there is no second refill between the match-length and literal-length reads.
// NOTE(review): the "56" in the name presumably means the caller guarantees
// that offset + match-length + literal-length extra bits fit in one refill
// (at most 56 bits) - confirm against the Go call site.
TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32
	MOVQ br+8(FP), AX
	MOVQ 32(AX), DX
	MOVBQZX 40(AX), BX
	MOVQ 24(AX), SI
	MOVQ (AX), AX
	ADDQ SI, AX
	MOVQ AX, (SP)
	MOVQ ctx+16(FP), AX
	MOVQ 72(AX), DI
	MOVQ 80(AX), R8
	MOVQ 88(AX), R9
	MOVQ 104(AX), R10
	MOVQ s+0(FP), AX
	MOVQ 144(AX), R11
	MOVQ 152(AX), R12
	MOVQ 160(AX), R13

sequenceDecs_decode_56_amd64_main_loop:
	MOVQ (SP), R14

	// Fill bitreader to have enough for the offset and match length.
	CMPQ SI, $0x08
	JL sequenceDecs_decode_56_amd64_fill_byte_by_byte
	MOVQ BX, AX
	SHRQ $0x03, AX
	SUBQ AX, R14
	MOVQ (R14), DX
	SUBQ AX, SI
	ANDQ $0x07, BX
	JMP sequenceDecs_decode_56_amd64_fill_end

sequenceDecs_decode_56_amd64_fill_byte_by_byte:
	CMPQ SI, $0x00
	JLE sequenceDecs_decode_56_amd64_fill_check_overread
	CMPQ BX, $0x07
	JLE sequenceDecs_decode_56_amd64_fill_end
	SHLQ $0x08, DX
	SUBQ $0x01, R14
	SUBQ $0x01, SI
	SUBQ $0x08, BX
	MOVBQZX (R14), AX
	ORQ AX, DX
	JMP sequenceDecs_decode_56_amd64_fill_byte_by_byte

sequenceDecs_decode_56_amd64_fill_check_overread:
	CMPQ BX, $0x40
	JA error_overread

sequenceDecs_decode_56_amd64_fill_end:
	// Update offset
	MOVQ R9, AX
	MOVQ BX, CX
	MOVQ DX, R15
	SHLQ CL, R15
	MOVB AH, CL
	SHRQ $0x20, AX
	TESTQ CX, CX
	JZ sequenceDecs_decode_56_amd64_of_update_zero
	ADDQ CX, BX
	CMPQ BX, $0x40
	JA sequenceDecs_decode_56_amd64_of_update_zero
	CMPQ CX, $0x40
	JAE sequenceDecs_decode_56_amd64_of_update_zero
	NEGQ CX
	SHRQ CL, R15
	ADDQ R15, AX

sequenceDecs_decode_56_amd64_of_update_zero:
	MOVQ AX, 16(R10)

	// Update match length
	MOVQ R8, AX
	MOVQ BX, CX
	MOVQ DX, R15
	SHLQ CL, R15
	MOVB AH, CL
	SHRQ $0x20, AX
	TESTQ CX, CX
	JZ sequenceDecs_decode_56_amd64_ml_update_zero
	ADDQ CX, BX
	CMPQ BX, $0x40
	JA sequenceDecs_decode_56_amd64_ml_update_zero
	CMPQ CX, $0x40
	JAE sequenceDecs_decode_56_amd64_ml_update_zero
	NEGQ CX
	SHRQ CL, R15
	ADDQ R15, AX

sequenceDecs_decode_56_amd64_ml_update_zero:
	MOVQ AX, 8(R10)

	// Update literal length
	MOVQ DI, AX
	MOVQ BX, CX
	MOVQ DX, R15
	SHLQ CL, R15
	MOVB AH, CL
	SHRQ $0x20, AX
	TESTQ CX, CX
	JZ sequenceDecs_decode_56_amd64_ll_update_zero
	ADDQ CX, BX
	CMPQ BX, $0x40
	JA sequenceDecs_decode_56_amd64_ll_update_zero
	CMPQ CX, $0x40
	JAE sequenceDecs_decode_56_amd64_ll_update_zero
	NEGQ CX
	SHRQ CL, R15
	ADDQ R15, AX

sequenceDecs_decode_56_amd64_ll_update_zero:
	MOVQ AX, (R10)

	// Fill bitreader for state updates
	MOVQ R14, (SP)
	MOVQ R9, AX
	SHRQ $0x08, AX
	MOVBQZX AL, AX
	MOVQ ctx+16(FP), CX
	CMPQ 96(CX), $0x00
	JZ sequenceDecs_decode_56_amd64_skip_update

	// Update Literal Length State
	MOVBQZX DI, R14
	SHRQ $0x10, DI
	MOVWQZX DI, DI
	LEAQ (BX)(R14*1), CX
	MOVQ DX, R15
	MOVQ CX, BX
	ROLQ CL, R15
	MOVL $0x00000001, BP
	MOVB R14, CL
	SHLL CL, BP
	DECL BP
	ANDQ BP, R15
	ADDQ R15, DI

	// Load ctx.llTable
	MOVQ ctx+16(FP), CX
	MOVQ (CX), CX
	MOVQ (CX)(DI*8), DI

	// Update Match Length State
	MOVBQZX R8, R14
	SHRQ $0x10, R8
	MOVWQZX R8, R8
	LEAQ (BX)(R14*1), CX
	MOVQ DX, R15
	MOVQ CX, BX
	ROLQ CL, R15
	MOVL $0x00000001, BP
	MOVB R14, CL
	SHLL CL, BP
	DECL BP
	ANDQ BP, R15
	ADDQ R15, R8

	// Load ctx.mlTable
	MOVQ ctx+16(FP), CX
	MOVQ 24(CX), CX
	MOVQ (CX)(R8*8), R8

	// Update Offset State
	MOVBQZX R9, R14
	SHRQ $0x10, R9
	MOVWQZX R9, R9
	LEAQ (BX)(R14*1), CX
	MOVQ DX, R15
	MOVQ CX, BX
	ROLQ CL, R15
	MOVL $0x00000001, BP
	MOVB R14, CL
	SHLL CL, BP
	DECL BP
	ANDQ BP, R15
	ADDQ R15, R9

	// Load ctx.ofTable
	MOVQ ctx+16(FP), CX
	MOVQ 48(CX), CX
	MOVQ (CX)(R9*8), R9

sequenceDecs_decode_56_amd64_skip_update:
	// Adjust offset
	MOVQ 16(R10), CX
	CMPQ AX, $0x01
	JBE sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0
	MOVQ R12, R13
	MOVQ R11, R12
	MOVQ CX, R11
	JMP sequenceDecs_decode_56_amd64_after_adjust

sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0:
	CMPQ (R10), $0x00000000
	JNE sequenceDecs_decode_56_amd64_adjust_offset_maybezero
	INCQ CX
	JMP sequenceDecs_decode_56_amd64_adjust_offset_nonzero

sequenceDecs_decode_56_amd64_adjust_offset_maybezero:
	TESTQ CX, CX
	JNZ sequenceDecs_decode_56_amd64_adjust_offset_nonzero
	MOVQ R11, CX
	JMP sequenceDecs_decode_56_amd64_after_adjust

sequenceDecs_decode_56_amd64_adjust_offset_nonzero:
	CMPQ CX, $0x01
	JB sequenceDecs_decode_56_amd64_adjust_zero
	JEQ sequenceDecs_decode_56_amd64_adjust_one
	CMPQ CX, $0x02
	JA sequenceDecs_decode_56_amd64_adjust_three
	JMP sequenceDecs_decode_56_amd64_adjust_two

sequenceDecs_decode_56_amd64_adjust_zero:
	MOVQ R11, AX
	JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid

sequenceDecs_decode_56_amd64_adjust_one:
	MOVQ R12, AX
	JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid

sequenceDecs_decode_56_amd64_adjust_two:
	MOVQ R13, AX
	JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid

sequenceDecs_decode_56_amd64_adjust_three:
	LEAQ -1(R11), AX

sequenceDecs_decode_56_amd64_adjust_test_temp_valid:
	TESTQ AX, AX
	JNZ sequenceDecs_decode_56_amd64_adjust_temp_valid
	MOVQ $0x00000001, AX

sequenceDecs_decode_56_amd64_adjust_temp_valid:
	CMPQ CX, $0x01
	CMOVQNE R12, R13
	MOVQ R11, R12
	MOVQ AX, R11
	MOVQ AX, CX

sequenceDecs_decode_56_amd64_after_adjust:
	MOVQ CX, 16(R10)

	// Check values
	MOVQ 8(R10), AX
	MOVQ (R10), R14
	LEAQ (AX)(R14*1), R15
	MOVQ s+0(FP), BP
	ADDQ R15, 256(BP)
	MOVQ ctx+16(FP), R15
	SUBQ R14, 128(R15)
	JS error_not_enough_literals
	CMPQ AX, $0x00020002
	JA sequenceDecs_decode_56_amd64_error_match_len_too_big
	TESTQ CX, CX
	JNZ sequenceDecs_decode_56_amd64_match_len_ofs_ok
	TESTQ AX, AX
	JNZ sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch

sequenceDecs_decode_56_amd64_match_len_ofs_ok:
	ADDQ $0x18, R10
	MOVQ ctx+16(FP), AX
	DECQ 96(AX)
	JNS sequenceDecs_decode_56_amd64_main_loop
	MOVQ s+0(FP), AX
	MOVQ R11, 144(AX)
	MOVQ R12, 152(AX)
	MOVQ R13, 160(AX)
	MOVQ br+8(FP), AX
	MOVQ DX, 32(AX)
	MOVB BL, 40(AX)
	MOVQ SI, 24(AX)

	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error
sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch:
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
sequenceDecs_decode_56_amd64_error_match_len_too_big:
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
	// NOTE: unlabeled and preceded by RET, so unreachable in this function.
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
error_not_enough_literals:
	MOVQ $0x00000004, ret+24(FP)
	RET

	// Return with overread error
error_overread:
	MOVQ $0x00000006, ret+24(FP)
	RET

// func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: BMI, BMI2, CMOV
//
// BMI/BMI2 variant of sequenceDecs_decode_amd64 with the same memory layout
// and return codes. Field extraction uses BEXTRQ (control 0x0808 = 8 bits
// starting at bit 8, control 0x1010 = 16 bits starting at bit 16), BZHIQ to
// keep the low n bits, and SHRXQ for variable shifts.
//
// Register use:
//   AX         64-bit bit buffer, loaded from 32(br)
//   DX         number of bits already consumed from AX, from the byte at 40(br)
//   BX         input bytes still available below the read pointer, from 24(br)
//   (SP), R13  current input pointer
//   SI/DI/R8   literal-length / match-length / offset decoder state
//   R9         output record cursor
//   R10-R12    three most recent offsets, from 144/152/160(s)
TEXT ·sequenceDecs_decode_bmi2(SB), $8-32
	MOVQ br+8(FP), CX
	MOVQ 32(CX), AX
	MOVBQZX 40(CX), DX
	MOVQ 24(CX), BX
	MOVQ (CX), CX
	ADDQ BX, CX
	MOVQ CX, (SP)
	MOVQ ctx+16(FP), CX
	MOVQ 72(CX), SI
	MOVQ 80(CX), DI
	MOVQ 88(CX), R8
	MOVQ 104(CX), R9
	MOVQ s+0(FP), CX
	MOVQ 144(CX), R10
	MOVQ 152(CX), R11
	MOVQ 160(CX), R12

sequenceDecs_decode_bmi2_main_loop:
	MOVQ (SP), R13

	// Fill bitreader to have enough for the offset and match length.
	CMPQ BX, $0x08
	JL sequenceDecs_decode_bmi2_fill_byte_by_byte
	MOVQ DX, CX
	SHRQ $0x03, CX
	SUBQ CX, R13
	MOVQ (R13), AX
	SUBQ CX, BX
	ANDQ $0x07, DX
	JMP sequenceDecs_decode_bmi2_fill_end

sequenceDecs_decode_bmi2_fill_byte_by_byte:
	CMPQ BX, $0x00
	JLE sequenceDecs_decode_bmi2_fill_check_overread
	CMPQ DX, $0x07
	JLE sequenceDecs_decode_bmi2_fill_end
	SHLQ $0x08, AX
	SUBQ $0x01, R13
	SUBQ $0x01, BX
	SUBQ $0x08, DX
	MOVBQZX (R13), CX
	ORQ CX, AX
	JMP sequenceDecs_decode_bmi2_fill_byte_by_byte

sequenceDecs_decode_bmi2_fill_check_overread:
	CMPQ DX, $0x40
	JA error_overread

sequenceDecs_decode_bmi2_fill_end:
	// Update offset
	// R14 = extra-bit count n. Rotating AX left by DX+n brings the next n
	// unread bits to the bottom; BZHIQ keeps them and they are added to the
	// baseline (state >> 32).
	MOVQ $0x00000808, CX
	BEXTRQ CX, R8, R14
	MOVQ AX, R15
	LEAQ (DX)(R14*1), CX
	ROLQ CL, R15
	BZHIQ R14, R15, R15
	MOVQ CX, DX
	MOVQ R8, CX
	SHRQ $0x20, CX
	ADDQ R15, CX
	MOVQ CX, 16(R9)

	// Update match length
	MOVQ $0x00000808, CX
	BEXTRQ CX, DI, R14
	MOVQ AX, R15
	LEAQ (DX)(R14*1), CX
	ROLQ CL, R15
	BZHIQ R14, R15, R15
	MOVQ CX, DX
	MOVQ DI, CX
	SHRQ $0x20, CX
	ADDQ R15, CX
	MOVQ CX, 8(R9)

	// Fill bitreader to have enough for the remaining
	CMPQ BX, $0x08
	JL sequenceDecs_decode_bmi2_fill_2_byte_by_byte
	MOVQ DX, CX
	SHRQ $0x03, CX
	SUBQ CX, R13
	MOVQ (R13), AX
	SUBQ CX, BX
	ANDQ $0x07, DX
	JMP sequenceDecs_decode_bmi2_fill_2_end

sequenceDecs_decode_bmi2_fill_2_byte_by_byte:
	CMPQ BX, $0x00
	JLE sequenceDecs_decode_bmi2_fill_2_check_overread
	CMPQ DX, $0x07
	JLE sequenceDecs_decode_bmi2_fill_2_end
	SHLQ $0x08, AX
	SUBQ $0x01, R13
	SUBQ $0x01, BX
	SUBQ $0x08, DX
	MOVBQZX (R13), CX
	ORQ CX, AX
	JMP sequenceDecs_decode_bmi2_fill_2_byte_by_byte

sequenceDecs_decode_bmi2_fill_2_check_overread:
	CMPQ DX, $0x40
	JA error_overread

sequenceDecs_decode_bmi2_fill_2_end:
	// Update literal length
	MOVQ $0x00000808, CX
	BEXTRQ CX, SI, R14
	MOVQ AX, R15
	LEAQ (DX)(R14*1), CX
	ROLQ CL, R15
	BZHIQ R14, R15, R15
	MOVQ CX, DX
	MOVQ SI, CX
	SHRQ $0x20, CX
	ADDQ R15, CX
	MOVQ CX, (R9)

	// Fill bitreader for state updates
	// Saves the input pointer and keeps the offset extra-bit count in R13
	// for "Adjust offset". The bits for all three next-state reads are then
	// fetched in one go: R14 = sum of the three low-byte bit counts, and
	// R15 = that many unread bits.
	MOVQ R13, (SP)
	MOVQ $0x00000808, CX
	BEXTRQ CX, R8, R13
	MOVQ ctx+16(FP), CX
	CMPQ 96(CX), $0x00
	JZ sequenceDecs_decode_bmi2_skip_update
	LEAQ (SI)(DI*1), R14
	ADDQ R8, R14
	MOVBQZX R14, R14
	LEAQ (DX)(R14*1), CX
	MOVQ AX, R15
	MOVQ CX, DX
	ROLQ CL, R15
	BZHIQ R14, R15, R15

	// Update Offset State
	BZHIQ R8, R15, CX
	SHRXQ R8, R15, R15
	MOVQ $0x00001010, R14
	BEXTRQ R14, R8, R8
	ADDQ CX, R8

	// Load ctx.ofTable
	MOVQ ctx+16(FP), CX
	MOVQ 48(CX), CX
	MOVQ (CX)(R8*8), R8

	// Update Match Length State
	BZHIQ DI, R15, CX
	SHRXQ DI, R15, R15
	MOVQ $0x00001010, R14
	BEXTRQ R14, DI, DI
	ADDQ CX, DI

	// Load ctx.mlTable
	MOVQ ctx+16(FP), CX
	MOVQ 24(CX), CX
	MOVQ (CX)(DI*8), DI

	// Update Literal Length State
	BZHIQ SI, R15, CX
	MOVQ $0x00001010, R14
	BEXTRQ R14, SI, SI
	ADDQ CX, SI

	// Load ctx.llTable
	MOVQ ctx+16(FP), CX
	MOVQ (CX), CX
	MOVQ (CX)(SI*8), SI

sequenceDecs_decode_bmi2_skip_update:
	// Adjust offset
	MOVQ 16(R9), CX
	CMPQ R13, $0x01
	JBE sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0
	MOVQ R11, R12
	MOVQ R10, R11
	MOVQ CX, R10
	JMP sequenceDecs_decode_bmi2_after_adjust

sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0:
	CMPQ (R9), $0x00000000
	JNE sequenceDecs_decode_bmi2_adjust_offset_maybezero
	INCQ CX
	JMP sequenceDecs_decode_bmi2_adjust_offset_nonzero

sequenceDecs_decode_bmi2_adjust_offset_maybezero:
	TESTQ CX, CX
	JNZ sequenceDecs_decode_bmi2_adjust_offset_nonzero
	MOVQ R10, CX
	JMP sequenceDecs_decode_bmi2_after_adjust

sequenceDecs_decode_bmi2_adjust_offset_nonzero:
	CMPQ CX, $0x01
	JB sequenceDecs_decode_bmi2_adjust_zero
	JEQ sequenceDecs_decode_bmi2_adjust_one
	CMPQ CX, $0x02
	JA sequenceDecs_decode_bmi2_adjust_three
	JMP sequenceDecs_decode_bmi2_adjust_two

sequenceDecs_decode_bmi2_adjust_zero:
	MOVQ R10, R13
	JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid

sequenceDecs_decode_bmi2_adjust_one:
	MOVQ R11, R13
	JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid

sequenceDecs_decode_bmi2_adjust_two:
	MOVQ R12, R13
	JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid

sequenceDecs_decode_bmi2_adjust_three:
	LEAQ -1(R10), R13

sequenceDecs_decode_bmi2_adjust_test_temp_valid:
	TESTQ R13, R13
	JNZ sequenceDecs_decode_bmi2_adjust_temp_valid
	MOVQ $0x00000001, R13

sequenceDecs_decode_bmi2_adjust_temp_valid:
	CMPQ CX, $0x01
	CMOVQNE R11, R12
	MOVQ R10, R11
	MOVQ R13, R10
	MOVQ R13, CX

sequenceDecs_decode_bmi2_after_adjust:
	MOVQ CX, 16(R9)

	// Check values
	MOVQ 8(R9), R13
	MOVQ (R9), R14
	LEAQ (R13)(R14*1), R15
	MOVQ s+0(FP), BP
	ADDQ R15, 256(BP)
	MOVQ ctx+16(FP), R15
	SUBQ R14, 128(R15)
	JS error_not_enough_literals
	CMPQ R13, $0x00020002
	JA sequenceDecs_decode_bmi2_error_match_len_too_big
	TESTQ CX, CX
	JNZ sequenceDecs_decode_bmi2_match_len_ofs_ok
	TESTQ R13, R13
	JNZ sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch

sequenceDecs_decode_bmi2_match_len_ofs_ok:
	ADDQ $0x18, R9
	MOVQ ctx+16(FP), CX
	DECQ 96(CX)
	JNS sequenceDecs_decode_bmi2_main_loop
	MOVQ s+0(FP), CX
	MOVQ R10, 144(CX)
	MOVQ R11, 152(CX)
	MOVQ R12, 160(CX)
	MOVQ br+8(FP), CX
	MOVQ AX, 32(CX)
	MOVB DL, 40(CX)
	MOVQ BX, 24(CX)

	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error
sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch:
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
sequenceDecs_decode_bmi2_error_match_len_too_big:
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
	// NOTE: unlabeled and preceded by RET, so unreachable in this function.
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
error_not_enough_literals:
	MOVQ $0x00000004, ret+24(FP)
	RET

	// Return with overread error
error_overread:
	MOVQ $0x00000006, ret+24(FP)
	RET

// func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: BMI, BMI2, CMOV
//
// Same as sequenceDecs_decode_bmi2 (registers, layout, return codes) except
// that the bit buffer is refilled only once per record: there is no second
// refill before the literal-length read.
// NOTE(review): presumably only valid when one refill covers all three
// extra-bit fields - confirm against the Go call site.
TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32
	MOVQ br+8(FP), CX
	MOVQ 32(CX), AX
	MOVBQZX 40(CX), DX
	MOVQ 24(CX), BX
	MOVQ (CX), CX
	ADDQ BX, CX
	MOVQ CX, (SP)
	MOVQ ctx+16(FP), CX
	MOVQ 72(CX), SI
	MOVQ 80(CX), DI
	MOVQ 88(CX), R8
	MOVQ 104(CX), R9
	MOVQ s+0(FP), CX
	MOVQ 144(CX), R10
	MOVQ 152(CX), R11
	MOVQ 160(CX), R12

sequenceDecs_decode_56_bmi2_main_loop:
	MOVQ (SP), R13

	// Fill bitreader to have enough for the offset and match length.
	CMPQ BX, $0x08
	JL sequenceDecs_decode_56_bmi2_fill_byte_by_byte
	MOVQ DX, CX
	SHRQ $0x03, CX
	SUBQ CX, R13
	MOVQ (R13), AX
	SUBQ CX, BX
	ANDQ $0x07, DX
	JMP sequenceDecs_decode_56_bmi2_fill_end

sequenceDecs_decode_56_bmi2_fill_byte_by_byte:
	CMPQ BX, $0x00
	JLE sequenceDecs_decode_56_bmi2_fill_check_overread
	CMPQ DX, $0x07
	JLE sequenceDecs_decode_56_bmi2_fill_end
	SHLQ $0x08, AX
	SUBQ $0x01, R13
	SUBQ $0x01, BX
	SUBQ $0x08, DX
	MOVBQZX (R13), CX
	ORQ CX, AX
	JMP sequenceDecs_decode_56_bmi2_fill_byte_by_byte

sequenceDecs_decode_56_bmi2_fill_check_overread:
	CMPQ DX, $0x40
	JA error_overread

sequenceDecs_decode_56_bmi2_fill_end:
	// Update offset
	MOVQ $0x00000808, CX
	BEXTRQ CX, R8, R14
	MOVQ AX, R15
	LEAQ (DX)(R14*1), CX
	ROLQ CL, R15
	BZHIQ R14, R15, R15
	MOVQ CX, DX
	MOVQ R8, CX
	SHRQ $0x20, CX
	ADDQ R15, CX
	MOVQ CX, 16(R9)

	// Update match length
	MOVQ $0x00000808, CX
	BEXTRQ CX, DI, R14
	MOVQ AX, R15
	LEAQ (DX)(R14*1), CX
	ROLQ CL, R15
	BZHIQ R14, R15, R15
	MOVQ CX, DX
	MOVQ DI, CX
	SHRQ $0x20, CX
	ADDQ R15, CX
	MOVQ CX, 8(R9)

	// Update literal length
	MOVQ $0x00000808, CX
	BEXTRQ CX, SI, R14
	MOVQ AX, R15
	LEAQ (DX)(R14*1), CX
	ROLQ CL, R15
	BZHIQ R14, R15, R15
	MOVQ CX, DX
	MOVQ SI, CX
	SHRQ $0x20, CX
	ADDQ R15, CX
	MOVQ CX, (R9)

	// Fill bitreader for state updates
	MOVQ R13, (SP)
	MOVQ $0x00000808, CX
	BEXTRQ CX, R8, R13
	MOVQ ctx+16(FP), CX
	CMPQ 96(CX), $0x00
	JZ sequenceDecs_decode_56_bmi2_skip_update
	LEAQ (SI)(DI*1), R14
	ADDQ R8, R14
	MOVBQZX R14, R14
	LEAQ (DX)(R14*1), CX
	MOVQ AX, R15
	MOVQ CX, DX
	ROLQ CL, R15
	BZHIQ R14, R15, R15

	// Update Offset State
	BZHIQ R8, R15, CX
	SHRXQ R8, R15, R15
	MOVQ $0x00001010, R14
	BEXTRQ R14, R8, R8
	ADDQ CX, R8

	// Load ctx.ofTable
	MOVQ ctx+16(FP), CX
	MOVQ 48(CX), CX
	MOVQ (CX)(R8*8), R8

	// Update Match Length State
	BZHIQ DI, R15, CX
	SHRXQ DI, R15, R15
	MOVQ $0x00001010, R14
	BEXTRQ R14, DI, DI
	ADDQ CX, DI

	// Load ctx.mlTable
	MOVQ ctx+16(FP), CX
	MOVQ 24(CX), CX
	MOVQ (CX)(DI*8), DI

	// Update Literal Length State
	BZHIQ SI, R15, CX
	MOVQ $0x00001010, R14
	BEXTRQ R14, SI, SI
	ADDQ CX, SI

	// Load ctx.llTable
	MOVQ ctx+16(FP), CX
	MOVQ (CX), CX
	MOVQ (CX)(SI*8), SI

sequenceDecs_decode_56_bmi2_skip_update:
	// Adjust offset
	MOVQ 16(R9), CX
	CMPQ R13, $0x01
	JBE sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0
	MOVQ R11, R12
	MOVQ R10, R11
	MOVQ CX, R10
	JMP sequenceDecs_decode_56_bmi2_after_adjust

sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0:
	CMPQ (R9), $0x00000000
	JNE sequenceDecs_decode_56_bmi2_adjust_offset_maybezero
	INCQ CX
	JMP sequenceDecs_decode_56_bmi2_adjust_offset_nonzero

sequenceDecs_decode_56_bmi2_adjust_offset_maybezero:
	TESTQ CX, CX
	JNZ sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
	MOVQ R10, CX
	JMP sequenceDecs_decode_56_bmi2_after_adjust

sequenceDecs_decode_56_bmi2_adjust_offset_nonzero:
	CMPQ CX, $0x01
	JB sequenceDecs_decode_56_bmi2_adjust_zero
	JEQ sequenceDecs_decode_56_bmi2_adjust_one
	CMPQ CX, $0x02
	JA sequenceDecs_decode_56_bmi2_adjust_three
	JMP sequenceDecs_decode_56_bmi2_adjust_two

sequenceDecs_decode_56_bmi2_adjust_zero:
	MOVQ R10, R13
	JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid

sequenceDecs_decode_56_bmi2_adjust_one:
	MOVQ R11, R13
	JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid

sequenceDecs_decode_56_bmi2_adjust_two:
	MOVQ R12, R13
	JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid

sequenceDecs_decode_56_bmi2_adjust_three:
	LEAQ -1(R10), R13

sequenceDecs_decode_56_bmi2_adjust_test_temp_valid:
	TESTQ R13, R13
	JNZ sequenceDecs_decode_56_bmi2_adjust_temp_valid
	MOVQ $0x00000001, R13

sequenceDecs_decode_56_bmi2_adjust_temp_valid:
	CMPQ CX, $0x01
	CMOVQNE R11, R12
	MOVQ R10, R11
	MOVQ R13, R10
	MOVQ R13, CX

sequenceDecs_decode_56_bmi2_after_adjust:
	MOVQ CX, 16(R9)

	// Check values
	MOVQ 8(R9), R13
	MOVQ (R9), R14
	LEAQ (R13)(R14*1), R15
	MOVQ s+0(FP), BP
	ADDQ R15, 256(BP)
	MOVQ ctx+16(FP), R15
	SUBQ R14, 128(R15)
	JS error_not_enough_literals
	CMPQ R13, $0x00020002
	JA sequenceDecs_decode_56_bmi2_error_match_len_too_big
	TESTQ CX, CX
	JNZ sequenceDecs_decode_56_bmi2_match_len_ofs_ok
	TESTQ R13, R13
	JNZ sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch

sequenceDecs_decode_56_bmi2_match_len_ofs_ok:
	ADDQ $0x18, R9
	MOVQ ctx+16(FP), CX
	DECQ 96(CX)
	JNS sequenceDecs_decode_56_bmi2_main_loop
	MOVQ s+0(FP), CX
	MOVQ R10, 144(CX)
	MOVQ R11, 152(CX)
	MOVQ R12, 160(CX)
	MOVQ br+8(FP), CX
	MOVQ AX, 32(CX)
	MOVB DL, 40(CX)
	MOVQ BX, 24(CX)

	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error
sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch:
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
sequenceDecs_decode_56_bmi2_error_match_len_too_big:
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
	// NOTE: unlabeled and preceded by RET, so unreachable in this function.
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
error_not_enough_literals:
	MOVQ $0x00000004, ret+24(FP)
	RET

	// Return with overread error
error_overread:
	MOVQ $0x00000006, ret+24(FP)
	RET

// func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
// Requires: SSE
//
// Executes decoded sequences: for each 24-byte record (literal length at +0,
// match length at +8, match offset at +16) it copies the literals to the
// output and then copies the match, taken from the history buffer, from the
// output already written, or from both.
//
// Register use:
//   CX   value at 8(ctx); the sequence index is compared against it, and the
//        function returns true immediately when it is zero
//   AX   cursor into the sequence array (0(ctx) + 24*seqIndex)
//   DX   sequence index (24(ctx)); written back on exit
//   BX   output write pointer (32(ctx) + outPosition)
//   SI   literals read pointer (80(ctx))
//   DI   output position (104(ctx)); written back on exit
//   R8   window size (120(ctx))
//   R10  history length (64(ctx)); R9 = 56(ctx) + R10, the end of the history
//
// On exit (except the empty case) the number of literal bytes consumed,
// SI - 80(ctx), is stored at 112(ctx). Returns false when a match offset
// exceeds outPosition+len(hist) or the window size, true otherwise.
//
// NOTE: literals and non-overlapping matches are copied in 16-byte chunks, so
// up to 15 bytes past the requested length may be read and written.
// NOTE(review): callers presumably guarantee that slack and use the _safe
// variant otherwise - confirm.
TEXT ·sequenceDecs_executeSimple_amd64(SB), $8-9
	MOVQ ctx+0(FP), R10
	MOVQ 8(R10), CX
	TESTQ CX, CX
	JZ empty_seqs
	MOVQ (R10), AX
	MOVQ 24(R10), DX
	MOVQ 32(R10), BX
	MOVQ 80(R10), SI
	MOVQ 104(R10), DI
	MOVQ 120(R10), R8
	MOVQ 56(R10), R9
	MOVQ 64(R10), R10
	ADDQ R10, R9

	// seqsBase += 24 * seqIndex
	LEAQ (DX)(DX*2), R11
	SHLQ $0x03, R11
	ADDQ R11, AX

	// outBase += outPosition
	ADDQ DI, BX

main_loop:
	// R11 = literal length, R12 = match offset, R13 = match length.
	MOVQ (AX), R11
	MOVQ 16(AX), R12
	MOVQ 8(AX), R13

	// Copy literals
	TESTQ R11, R11
	JZ check_offset
	XORQ R14, R14

copy_1:
	// 16 bytes per iteration until at least R11 bytes are copied; the
	// pointers are then advanced by exactly R11.
	MOVUPS (SI)(R14*1), X0
	MOVUPS X0, (BX)(R14*1)
	ADDQ $0x10, R14
	CMPQ R14, R11
	JB copy_1
	ADDQ R11, SI
	ADDQ R11, BX
	ADDQ R11, DI

	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
check_offset:
	LEAQ (DI)(R10*1), R11
	CMPQ R12, R11
	JG error_match_off_too_big
	CMPQ R12, R8
	JG error_match_off_too_big

	// Copy match from history
	// R11 = offset - outPosition = bytes the match reaches back into history;
	// when that is <= 0 the whole match lies in the current output.
	// R14 = source pointer inside the history (history end - R11).
	MOVQ R12, R11
	SUBQ DI, R11
	JLS copy_match
	MOVQ R9, R14
	SUBQ R11, R14
	CMPQ R13, R11
	JG copy_all_from_history
	MOVQ R13, R11
	SUBQ $0x10, R11
	JB copy_4_small

copy_4_loop:
	// Whole match comes from history: exact-length copy of R13 bytes, with
	// the last 16-byte store overlapping the previous one.
	MOVUPS (R14), X0
	MOVUPS X0, (BX)
	ADDQ $0x10, R14
	ADDQ $0x10, BX
	SUBQ $0x10, R11
	JAE copy_4_loop
	LEAQ 16(R14)(R11*1), R14
	LEAQ 16(BX)(R11*1), BX
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(BX)
	JMP copy_4_end

copy_4_small:
	// Lengths below 16; there is no 1-or-2-byte case on this path.
	CMPQ R13, $0x03
	JE copy_4_move_3
	CMPQ R13, $0x08
	JB copy_4_move_4through7
	JMP copy_4_move_8through16

copy_4_move_3:
	MOVW (R14), R11
	MOVB 2(R14), R12
	MOVW R11, (BX)
	MOVB R12, 2(BX)
	ADDQ R13, R14
	ADDQ R13, BX
	JMP copy_4_end

copy_4_move_4through7:
	MOVL (R14), R11
	MOVL -4(R14)(R13*1), R12
	MOVL R11, (BX)
	MOVL R12, -4(BX)(R13*1)
	ADDQ R13, R14
	ADDQ R13, BX
	JMP copy_4_end

copy_4_move_8through16:
	MOVQ (R14), R11
	MOVQ -8(R14)(R13*1), R12
	MOVQ R11, (BX)
	MOVQ R12, -8(BX)(R13*1)
	ADDQ R13, R14
	ADDQ R13, BX

copy_4_end:
	ADDQ R13, DI
	ADDQ $0x18, AX
	INCQ DX
	CMPQ DX, CX
	JB main_loop
	JMP loop_finished

copy_all_from_history:
	// Match is longer than the history part: copy the R11 history bytes
	// exactly, then fall through to copy the remainder from the output.
	MOVQ R11, R15
	SUBQ $0x10, R15
	JB copy_5_small

copy_5_loop:
	MOVUPS (R14), X0
	MOVUPS X0, (BX)
	ADDQ $0x10, R14
	ADDQ $0x10, BX
	SUBQ $0x10, R15
	JAE copy_5_loop
	LEAQ 16(R14)(R15*1), R14
	LEAQ 16(BX)(R15*1), BX
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(BX)
	JMP copy_5_end

copy_5_small:
	CMPQ R11, $0x03
	JE copy_5_move_3
	JB copy_5_move_1or2
	CMPQ R11, $0x08
	JB copy_5_move_4through7
	JMP copy_5_move_8through16

copy_5_move_1or2:
	MOVB (R14), R15
	MOVB -1(R14)(R11*1), BP
	MOVB R15, (BX)
	MOVB BP, -1(BX)(R11*1)
	ADDQ R11, R14
	ADDQ R11, BX
	JMP copy_5_end

copy_5_move_3:
	MOVW (R14), R15
	MOVB 2(R14), BP
	MOVW R15, (BX)
	MOVB BP, 2(BX)
	ADDQ R11, R14
	ADDQ R11, BX
	JMP copy_5_end

copy_5_move_4through7:
	MOVL (R14), R15
	MOVL -4(R14)(R11*1), BP
	MOVL R15, (BX)
	MOVL BP, -4(BX)(R11*1)
	ADDQ R11, R14
	ADDQ R11, BX
	JMP copy_5_end

copy_5_move_8through16:
	MOVQ (R14), R15
	MOVQ -8(R14)(R11*1), BP
	MOVQ R15, (BX)
	MOVQ BP, -8(BX)(R11*1)
	ADDQ R11, R14
	ADDQ R11, BX

copy_5_end:
	ADDQ R11, DI
	SUBQ R11, R13

	// Copy match from the current buffer
copy_match:
	MOVQ BX, R11
	SUBQ R12, R11

	// ml <= mo
	CMPQ R13, R12
	JA copy_overlapping_match

	// Copy non-overlapping match
	// BX is advanced by the match length up front; R12 is the write cursor.
	ADDQ R13, DI
	MOVQ BX, R12
	ADDQ R13, BX

copy_2:
	MOVUPS (R11), X0
	MOVUPS X0, (R12)
	ADDQ $0x10, R11
	ADDQ $0x10, R12
	SUBQ $0x10, R13
	JHI copy_2
	JMP handle_loop

	// Copy overlapping match
copy_overlapping_match:
	ADDQ R13, DI

copy_slow_3:
	// Byte-by-byte, so bytes written earlier in this match can be re-read.
	MOVB (R11), R12
	MOVB R12, (BX)
	INCQ R11
	INCQ BX
	DECQ R13
	JNZ copy_slow_3

handle_loop:
	ADDQ $0x18, AX
	INCQ DX
	CMPQ DX, CX
	JB main_loop

loop_finished:
	// Return value
	MOVB $0x01, ret+8(FP)

	// Update the context
	MOVQ ctx+0(FP), AX
	MOVQ DX, 24(AX)
	MOVQ DI, 104(AX)
	SUBQ 80(AX), SI
	MOVQ SI, 112(AX)
	RET

error_match_off_too_big:
	// Return value
	MOVB $0x00, ret+8(FP)

	// Update the context
	MOVQ ctx+0(FP), AX
	MOVQ DX, 24(AX)
	MOVQ DI, 104(AX)
	SUBQ 80(AX), SI
	MOVQ SI, 112(AX)
	RET

empty_seqs:
	// Return value
	MOVB $0x01, ret+8(FP)
	RET

// func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool
// Requires: SSE
TEXT ·sequenceDecs_executeSimple_safe_amd64(SB), $8-9
	MOVQ ctx+0(FP), R10
	MOVQ 8(R10), CX
	TESTQ CX, CX
	JZ empty_seqs
	MOVQ (R10), AX
	MOVQ 24(R10), DX
	MOVQ 32(R10), BX
	MOVQ 80(R10), SI
	MOVQ 104(R10), DI
	MOVQ 120(R10), R8
	MOVQ 56(R10), R9
	MOVQ 64(R10), R10
	ADDQ R10, R9

	// seqsBase += 24 * seqIndex
	LEAQ (DX)(DX*2), R11
	SHLQ $0x03, R11
	ADDQ R11, AX

	// outBase += outPosition
	ADDQ DI, BX

main_loop:
	MOVQ (AX), R11
	MOVQ 16(AX), R12
	MOVQ 8(AX), R13

	// Copy literals
	TESTQ R11, R11
	JZ check_offset
	MOVQ R11, R14
	SUBQ $0x10, R14
	JB copy_1_small

copy_1_loop:
	MOVUPS (SI), X0
	MOVUPS X0, (BX)
	ADDQ $0x10, SI
	ADDQ $0x10, BX
	SUBQ $0x10, R14
1484 JAE copy_1_loop 1485 LEAQ 16(SI)(R14*1), SI 1486 LEAQ 16(BX)(R14*1), BX 1487 MOVUPS -16(SI), X0 1488 MOVUPS X0, -16(BX) 1489 JMP copy_1_end 1490 1491 copy_1_small: 1492 CMPQ R11, $0x03 1493 JE copy_1_move_3 1494 JB copy_1_move_1or2 1495 CMPQ R11, $0x08 1496 JB copy_1_move_4through7 1497 JMP copy_1_move_8through16 1498 1499 copy_1_move_1or2: 1500 MOVB (SI), R14 1501 MOVB -1(SI)(R11*1), R15 1502 MOVB R14, (BX) 1503 MOVB R15, -1(BX)(R11*1) 1504 ADDQ R11, SI 1505 ADDQ R11, BX 1506 JMP copy_1_end 1507 1508 copy_1_move_3: 1509 MOVW (SI), R14 1510 MOVB 2(SI), R15 1511 MOVW R14, (BX) 1512 MOVB R15, 2(BX) 1513 ADDQ R11, SI 1514 ADDQ R11, BX 1515 JMP copy_1_end 1516 1517 copy_1_move_4through7: 1518 MOVL (SI), R14 1519 MOVL -4(SI)(R11*1), R15 1520 MOVL R14, (BX) 1521 MOVL R15, -4(BX)(R11*1) 1522 ADDQ R11, SI 1523 ADDQ R11, BX 1524 JMP copy_1_end 1525 1526 copy_1_move_8through16: 1527 MOVQ (SI), R14 1528 MOVQ -8(SI)(R11*1), R15 1529 MOVQ R14, (BX) 1530 MOVQ R15, -8(BX)(R11*1) 1531 ADDQ R11, SI 1532 ADDQ R11, BX 1533 1534 copy_1_end: 1535 ADDQ R11, DI 1536 1537 // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) 1538 check_offset: 1539 LEAQ (DI)(R10*1), R11 1540 CMPQ R12, R11 1541 JG error_match_off_too_big 1542 CMPQ R12, R8 1543 JG error_match_off_too_big 1544 1545 // Copy match from history 1546 MOVQ R12, R11 1547 SUBQ DI, R11 1548 JLS copy_match 1549 MOVQ R9, R14 1550 SUBQ R11, R14 1551 CMPQ R13, R11 1552 JG copy_all_from_history 1553 MOVQ R13, R11 1554 SUBQ $0x10, R11 1555 JB copy_4_small 1556 1557 copy_4_loop: 1558 MOVUPS (R14), X0 1559 MOVUPS X0, (BX) 1560 ADDQ $0x10, R14 1561 ADDQ $0x10, BX 1562 SUBQ $0x10, R11 1563 JAE copy_4_loop 1564 LEAQ 16(R14)(R11*1), R14 1565 LEAQ 16(BX)(R11*1), BX 1566 MOVUPS -16(R14), X0 1567 MOVUPS X0, -16(BX) 1568 JMP copy_4_end 1569 1570 copy_4_small: 1571 CMPQ R13, $0x03 1572 JE copy_4_move_3 1573 CMPQ R13, $0x08 1574 JB copy_4_move_4through7 1575 JMP copy_4_move_8through16 1576 1577 copy_4_move_3: 1578 MOVW (R14), R11 
1579 MOVB 2(R14), R12 1580 MOVW R11, (BX) 1581 MOVB R12, 2(BX) 1582 ADDQ R13, R14 1583 ADDQ R13, BX 1584 JMP copy_4_end 1585 1586 copy_4_move_4through7: 1587 MOVL (R14), R11 1588 MOVL -4(R14)(R13*1), R12 1589 MOVL R11, (BX) 1590 MOVL R12, -4(BX)(R13*1) 1591 ADDQ R13, R14 1592 ADDQ R13, BX 1593 JMP copy_4_end 1594 1595 copy_4_move_8through16: 1596 MOVQ (R14), R11 1597 MOVQ -8(R14)(R13*1), R12 1598 MOVQ R11, (BX) 1599 MOVQ R12, -8(BX)(R13*1) 1600 ADDQ R13, R14 1601 ADDQ R13, BX 1602 1603 copy_4_end: 1604 ADDQ R13, DI 1605 ADDQ $0x18, AX 1606 INCQ DX 1607 CMPQ DX, CX 1608 JB main_loop 1609 JMP loop_finished 1610 1611 copy_all_from_history: 1612 MOVQ R11, R15 1613 SUBQ $0x10, R15 1614 JB copy_5_small 1615 1616 copy_5_loop: 1617 MOVUPS (R14), X0 1618 MOVUPS X0, (BX) 1619 ADDQ $0x10, R14 1620 ADDQ $0x10, BX 1621 SUBQ $0x10, R15 1622 JAE copy_5_loop 1623 LEAQ 16(R14)(R15*1), R14 1624 LEAQ 16(BX)(R15*1), BX 1625 MOVUPS -16(R14), X0 1626 MOVUPS X0, -16(BX) 1627 JMP copy_5_end 1628 1629 copy_5_small: 1630 CMPQ R11, $0x03 1631 JE copy_5_move_3 1632 JB copy_5_move_1or2 1633 CMPQ R11, $0x08 1634 JB copy_5_move_4through7 1635 JMP copy_5_move_8through16 1636 1637 copy_5_move_1or2: 1638 MOVB (R14), R15 1639 MOVB -1(R14)(R11*1), BP 1640 MOVB R15, (BX) 1641 MOVB BP, -1(BX)(R11*1) 1642 ADDQ R11, R14 1643 ADDQ R11, BX 1644 JMP copy_5_end 1645 1646 copy_5_move_3: 1647 MOVW (R14), R15 1648 MOVB 2(R14), BP 1649 MOVW R15, (BX) 1650 MOVB BP, 2(BX) 1651 ADDQ R11, R14 1652 ADDQ R11, BX 1653 JMP copy_5_end 1654 1655 copy_5_move_4through7: 1656 MOVL (R14), R15 1657 MOVL -4(R14)(R11*1), BP 1658 MOVL R15, (BX) 1659 MOVL BP, -4(BX)(R11*1) 1660 ADDQ R11, R14 1661 ADDQ R11, BX 1662 JMP copy_5_end 1663 1664 copy_5_move_8through16: 1665 MOVQ (R14), R15 1666 MOVQ -8(R14)(R11*1), BP 1667 MOVQ R15, (BX) 1668 MOVQ BP, -8(BX)(R11*1) 1669 ADDQ R11, R14 1670 ADDQ R11, BX 1671 1672 copy_5_end: 1673 ADDQ R11, DI 1674 SUBQ R11, R13 1675 1676 // Copy match from the current buffer 1677 copy_match: 1678 MOVQ BX, 
R11 1679 SUBQ R12, R11 1680 1681 // ml <= mo 1682 CMPQ R13, R12 1683 JA copy_overlapping_match 1684 1685 // Copy non-overlapping match 1686 ADDQ R13, DI 1687 MOVQ R13, R12 1688 SUBQ $0x10, R12 1689 JB copy_2_small 1690 1691 copy_2_loop: 1692 MOVUPS (R11), X0 1693 MOVUPS X0, (BX) 1694 ADDQ $0x10, R11 1695 ADDQ $0x10, BX 1696 SUBQ $0x10, R12 1697 JAE copy_2_loop 1698 LEAQ 16(R11)(R12*1), R11 1699 LEAQ 16(BX)(R12*1), BX 1700 MOVUPS -16(R11), X0 1701 MOVUPS X0, -16(BX) 1702 JMP copy_2_end 1703 1704 copy_2_small: 1705 CMPQ R13, $0x03 1706 JE copy_2_move_3 1707 JB copy_2_move_1or2 1708 CMPQ R13, $0x08 1709 JB copy_2_move_4through7 1710 JMP copy_2_move_8through16 1711 1712 copy_2_move_1or2: 1713 MOVB (R11), R12 1714 MOVB -1(R11)(R13*1), R14 1715 MOVB R12, (BX) 1716 MOVB R14, -1(BX)(R13*1) 1717 ADDQ R13, R11 1718 ADDQ R13, BX 1719 JMP copy_2_end 1720 1721 copy_2_move_3: 1722 MOVW (R11), R12 1723 MOVB 2(R11), R14 1724 MOVW R12, (BX) 1725 MOVB R14, 2(BX) 1726 ADDQ R13, R11 1727 ADDQ R13, BX 1728 JMP copy_2_end 1729 1730 copy_2_move_4through7: 1731 MOVL (R11), R12 1732 MOVL -4(R11)(R13*1), R14 1733 MOVL R12, (BX) 1734 MOVL R14, -4(BX)(R13*1) 1735 ADDQ R13, R11 1736 ADDQ R13, BX 1737 JMP copy_2_end 1738 1739 copy_2_move_8through16: 1740 MOVQ (R11), R12 1741 MOVQ -8(R11)(R13*1), R14 1742 MOVQ R12, (BX) 1743 MOVQ R14, -8(BX)(R13*1) 1744 ADDQ R13, R11 1745 ADDQ R13, BX 1746 1747 copy_2_end: 1748 JMP handle_loop 1749 1750 // Copy overlapping match 1751 copy_overlapping_match: 1752 ADDQ R13, DI 1753 1754 copy_slow_3: 1755 MOVB (R11), R12 1756 MOVB R12, (BX) 1757 INCQ R11 1758 INCQ BX 1759 DECQ R13 1760 JNZ copy_slow_3 1761 1762 handle_loop: 1763 ADDQ $0x18, AX 1764 INCQ DX 1765 CMPQ DX, CX 1766 JB main_loop 1767 1768 loop_finished: 1769 // Return value 1770 MOVB $0x01, ret+8(FP) 1771 1772 // Update the context 1773 MOVQ ctx+0(FP), AX 1774 MOVQ DX, 24(AX) 1775 MOVQ DI, 104(AX) 1776 SUBQ 80(AX), SI 1777 MOVQ SI, 112(AX) 1778 RET 1779 1780 error_match_off_too_big: 1781 // Return value 
1782 MOVB $0x00, ret+8(FP) 1783 1784 // Update the context 1785 MOVQ ctx+0(FP), AX 1786 MOVQ DX, 24(AX) 1787 MOVQ DI, 104(AX) 1788 SUBQ 80(AX), SI 1789 MOVQ SI, 112(AX) 1790 RET 1791 1792 empty_seqs: 1793 // Return value 1794 MOVB $0x01, ret+8(FP) 1795 RET 1796 1797 // func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int 1798 // Requires: CMOV, SSE 1799 TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32 1800 MOVQ br+8(FP), AX 1801 MOVQ 32(AX), DX 1802 MOVBQZX 40(AX), BX 1803 MOVQ 24(AX), SI 1804 MOVQ (AX), AX 1805 ADDQ SI, AX 1806 MOVQ AX, (SP) 1807 MOVQ ctx+16(FP), AX 1808 MOVQ 72(AX), DI 1809 MOVQ 80(AX), R8 1810 MOVQ 88(AX), R9 1811 XORQ CX, CX 1812 MOVQ CX, 8(SP) 1813 MOVQ CX, 16(SP) 1814 MOVQ CX, 24(SP) 1815 MOVQ 112(AX), R10 1816 MOVQ 128(AX), CX 1817 MOVQ CX, 32(SP) 1818 MOVQ 144(AX), R11 1819 MOVQ 136(AX), R12 1820 MOVQ 200(AX), CX 1821 MOVQ CX, 56(SP) 1822 MOVQ 176(AX), CX 1823 MOVQ CX, 48(SP) 1824 MOVQ 184(AX), AX 1825 MOVQ AX, 40(SP) 1826 MOVQ 40(SP), AX 1827 ADDQ AX, 48(SP) 1828 1829 // Calculate pointer to s.out[cap(s.out)] (a past-end pointer) 1830 ADDQ R10, 32(SP) 1831 1832 // outBase += outPosition 1833 ADDQ R12, R10 1834 1835 sequenceDecs_decodeSync_amd64_main_loop: 1836 MOVQ (SP), R13 1837 1838 // Fill bitreader to have enough for the offset and match length. 
1839 CMPQ SI, $0x08 1840 JL sequenceDecs_decodeSync_amd64_fill_byte_by_byte 1841 MOVQ BX, AX 1842 SHRQ $0x03, AX 1843 SUBQ AX, R13 1844 MOVQ (R13), DX 1845 SUBQ AX, SI 1846 ANDQ $0x07, BX 1847 JMP sequenceDecs_decodeSync_amd64_fill_end 1848 1849 sequenceDecs_decodeSync_amd64_fill_byte_by_byte: 1850 CMPQ SI, $0x00 1851 JLE sequenceDecs_decodeSync_amd64_fill_check_overread 1852 CMPQ BX, $0x07 1853 JLE sequenceDecs_decodeSync_amd64_fill_end 1854 SHLQ $0x08, DX 1855 SUBQ $0x01, R13 1856 SUBQ $0x01, SI 1857 SUBQ $0x08, BX 1858 MOVBQZX (R13), AX 1859 ORQ AX, DX 1860 JMP sequenceDecs_decodeSync_amd64_fill_byte_by_byte 1861 1862 sequenceDecs_decodeSync_amd64_fill_check_overread: 1863 CMPQ BX, $0x40 1864 JA error_overread 1865 1866 sequenceDecs_decodeSync_amd64_fill_end: 1867 // Update offset 1868 MOVQ R9, AX 1869 MOVQ BX, CX 1870 MOVQ DX, R14 1871 SHLQ CL, R14 1872 MOVB AH, CL 1873 SHRQ $0x20, AX 1874 TESTQ CX, CX 1875 JZ sequenceDecs_decodeSync_amd64_of_update_zero 1876 ADDQ CX, BX 1877 CMPQ BX, $0x40 1878 JA sequenceDecs_decodeSync_amd64_of_update_zero 1879 CMPQ CX, $0x40 1880 JAE sequenceDecs_decodeSync_amd64_of_update_zero 1881 NEGQ CX 1882 SHRQ CL, R14 1883 ADDQ R14, AX 1884 1885 sequenceDecs_decodeSync_amd64_of_update_zero: 1886 MOVQ AX, 8(SP) 1887 1888 // Update match length 1889 MOVQ R8, AX 1890 MOVQ BX, CX 1891 MOVQ DX, R14 1892 SHLQ CL, R14 1893 MOVB AH, CL 1894 SHRQ $0x20, AX 1895 TESTQ CX, CX 1896 JZ sequenceDecs_decodeSync_amd64_ml_update_zero 1897 ADDQ CX, BX 1898 CMPQ BX, $0x40 1899 JA sequenceDecs_decodeSync_amd64_ml_update_zero 1900 CMPQ CX, $0x40 1901 JAE sequenceDecs_decodeSync_amd64_ml_update_zero 1902 NEGQ CX 1903 SHRQ CL, R14 1904 ADDQ R14, AX 1905 1906 sequenceDecs_decodeSync_amd64_ml_update_zero: 1907 MOVQ AX, 16(SP) 1908 1909 // Fill bitreader to have enough for the remaining 1910 CMPQ SI, $0x08 1911 JL sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte 1912 MOVQ BX, AX 1913 SHRQ $0x03, AX 1914 SUBQ AX, R13 1915 MOVQ (R13), DX 1916 SUBQ AX, SI 1917 
ANDQ $0x07, BX 1918 JMP sequenceDecs_decodeSync_amd64_fill_2_end 1919 1920 sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte: 1921 CMPQ SI, $0x00 1922 JLE sequenceDecs_decodeSync_amd64_fill_2_check_overread 1923 CMPQ BX, $0x07 1924 JLE sequenceDecs_decodeSync_amd64_fill_2_end 1925 SHLQ $0x08, DX 1926 SUBQ $0x01, R13 1927 SUBQ $0x01, SI 1928 SUBQ $0x08, BX 1929 MOVBQZX (R13), AX 1930 ORQ AX, DX 1931 JMP sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte 1932 1933 sequenceDecs_decodeSync_amd64_fill_2_check_overread: 1934 CMPQ BX, $0x40 1935 JA error_overread 1936 1937 sequenceDecs_decodeSync_amd64_fill_2_end: 1938 // Update literal length 1939 MOVQ DI, AX 1940 MOVQ BX, CX 1941 MOVQ DX, R14 1942 SHLQ CL, R14 1943 MOVB AH, CL 1944 SHRQ $0x20, AX 1945 TESTQ CX, CX 1946 JZ sequenceDecs_decodeSync_amd64_ll_update_zero 1947 ADDQ CX, BX 1948 CMPQ BX, $0x40 1949 JA sequenceDecs_decodeSync_amd64_ll_update_zero 1950 CMPQ CX, $0x40 1951 JAE sequenceDecs_decodeSync_amd64_ll_update_zero 1952 NEGQ CX 1953 SHRQ CL, R14 1954 ADDQ R14, AX 1955 1956 sequenceDecs_decodeSync_amd64_ll_update_zero: 1957 MOVQ AX, 24(SP) 1958 1959 // Fill bitreader for state updates 1960 MOVQ R13, (SP) 1961 MOVQ R9, AX 1962 SHRQ $0x08, AX 1963 MOVBQZX AL, AX 1964 MOVQ ctx+16(FP), CX 1965 CMPQ 96(CX), $0x00 1966 JZ sequenceDecs_decodeSync_amd64_skip_update 1967 1968 // Update Literal Length State 1969 MOVBQZX DI, R13 1970 SHRQ $0x10, DI 1971 MOVWQZX DI, DI 1972 LEAQ (BX)(R13*1), CX 1973 MOVQ DX, R14 1974 MOVQ CX, BX 1975 ROLQ CL, R14 1976 MOVL $0x00000001, R15 1977 MOVB R13, CL 1978 SHLL CL, R15 1979 DECL R15 1980 ANDQ R15, R14 1981 ADDQ R14, DI 1982 1983 // Load ctx.llTable 1984 MOVQ ctx+16(FP), CX 1985 MOVQ (CX), CX 1986 MOVQ (CX)(DI*8), DI 1987 1988 // Update Match Length State 1989 MOVBQZX R8, R13 1990 SHRQ $0x10, R8 1991 MOVWQZX R8, R8 1992 LEAQ (BX)(R13*1), CX 1993 MOVQ DX, R14 1994 MOVQ CX, BX 1995 ROLQ CL, R14 1996 MOVL $0x00000001, R15 1997 MOVB R13, CL 1998 SHLL CL, R15 1999 DECL R15 2000 ANDQ R15, 
R14 2001 ADDQ R14, R8 2002 2003 // Load ctx.mlTable 2004 MOVQ ctx+16(FP), CX 2005 MOVQ 24(CX), CX 2006 MOVQ (CX)(R8*8), R8 2007 2008 // Update Offset State 2009 MOVBQZX R9, R13 2010 SHRQ $0x10, R9 2011 MOVWQZX R9, R9 2012 LEAQ (BX)(R13*1), CX 2013 MOVQ DX, R14 2014 MOVQ CX, BX 2015 ROLQ CL, R14 2016 MOVL $0x00000001, R15 2017 MOVB R13, CL 2018 SHLL CL, R15 2019 DECL R15 2020 ANDQ R15, R14 2021 ADDQ R14, R9 2022 2023 // Load ctx.ofTable 2024 MOVQ ctx+16(FP), CX 2025 MOVQ 48(CX), CX 2026 MOVQ (CX)(R9*8), R9 2027 2028 sequenceDecs_decodeSync_amd64_skip_update: 2029 // Adjust offset 2030 MOVQ s+0(FP), CX 2031 MOVQ 8(SP), R13 2032 CMPQ AX, $0x01 2033 JBE sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0 2034 MOVUPS 144(CX), X0 2035 MOVQ R13, 144(CX) 2036 MOVUPS X0, 152(CX) 2037 JMP sequenceDecs_decodeSync_amd64_after_adjust 2038 2039 sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0: 2040 CMPQ 24(SP), $0x00000000 2041 JNE sequenceDecs_decodeSync_amd64_adjust_offset_maybezero 2042 INCQ R13 2043 JMP sequenceDecs_decodeSync_amd64_adjust_offset_nonzero 2044 2045 sequenceDecs_decodeSync_amd64_adjust_offset_maybezero: 2046 TESTQ R13, R13 2047 JNZ sequenceDecs_decodeSync_amd64_adjust_offset_nonzero 2048 MOVQ 144(CX), R13 2049 JMP sequenceDecs_decodeSync_amd64_after_adjust 2050 2051 sequenceDecs_decodeSync_amd64_adjust_offset_nonzero: 2052 MOVQ R13, AX 2053 XORQ R14, R14 2054 MOVQ $-1, R15 2055 CMPQ R13, $0x03 2056 CMOVQEQ R14, AX 2057 CMOVQEQ R15, R14 2058 ADDQ 144(CX)(AX*8), R14 2059 JNZ sequenceDecs_decodeSync_amd64_adjust_temp_valid 2060 MOVQ $0x00000001, R14 2061 2062 sequenceDecs_decodeSync_amd64_adjust_temp_valid: 2063 CMPQ R13, $0x01 2064 JZ sequenceDecs_decodeSync_amd64_adjust_skip 2065 MOVQ 152(CX), AX 2066 MOVQ AX, 160(CX) 2067 2068 sequenceDecs_decodeSync_amd64_adjust_skip: 2069 MOVQ 144(CX), AX 2070 MOVQ AX, 152(CX) 2071 MOVQ R14, 144(CX) 2072 MOVQ R14, R13 2073 2074 sequenceDecs_decodeSync_amd64_after_adjust: 2075 MOVQ R13, 8(SP) 2076 2077 // Check values 2078 
MOVQ 16(SP), AX 2079 MOVQ 24(SP), CX 2080 LEAQ (AX)(CX*1), R14 2081 MOVQ s+0(FP), R15 2082 ADDQ R14, 256(R15) 2083 MOVQ ctx+16(FP), R14 2084 SUBQ CX, 104(R14) 2085 JS error_not_enough_literals 2086 CMPQ AX, $0x00020002 2087 JA sequenceDecs_decodeSync_amd64_error_match_len_too_big 2088 TESTQ R13, R13 2089 JNZ sequenceDecs_decodeSync_amd64_match_len_ofs_ok 2090 TESTQ AX, AX 2091 JNZ sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch 2092 2093 sequenceDecs_decodeSync_amd64_match_len_ofs_ok: 2094 MOVQ 24(SP), AX 2095 MOVQ 8(SP), CX 2096 MOVQ 16(SP), R13 2097 2098 // Check if we have enough space in s.out 2099 LEAQ (AX)(R13*1), R14 2100 ADDQ R10, R14 2101 CMPQ R14, 32(SP) 2102 JA error_not_enough_space 2103 2104 // Copy literals 2105 TESTQ AX, AX 2106 JZ check_offset 2107 XORQ R14, R14 2108 2109 copy_1: 2110 MOVUPS (R11)(R14*1), X0 2111 MOVUPS X0, (R10)(R14*1) 2112 ADDQ $0x10, R14 2113 CMPQ R14, AX 2114 JB copy_1 2115 ADDQ AX, R11 2116 ADDQ AX, R10 2117 ADDQ AX, R12 2118 2119 // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) 2120 check_offset: 2121 MOVQ R12, AX 2122 ADDQ 40(SP), AX 2123 CMPQ CX, AX 2124 JG error_match_off_too_big 2125 CMPQ CX, 56(SP) 2126 JG error_match_off_too_big 2127 2128 // Copy match from history 2129 MOVQ CX, AX 2130 SUBQ R12, AX 2131 JLS copy_match 2132 MOVQ 48(SP), R14 2133 SUBQ AX, R14 2134 CMPQ R13, AX 2135 JG copy_all_from_history 2136 MOVQ R13, AX 2137 SUBQ $0x10, AX 2138 JB copy_4_small 2139 2140 copy_4_loop: 2141 MOVUPS (R14), X0 2142 MOVUPS X0, (R10) 2143 ADDQ $0x10, R14 2144 ADDQ $0x10, R10 2145 SUBQ $0x10, AX 2146 JAE copy_4_loop 2147 LEAQ 16(R14)(AX*1), R14 2148 LEAQ 16(R10)(AX*1), R10 2149 MOVUPS -16(R14), X0 2150 MOVUPS X0, -16(R10) 2151 JMP copy_4_end 2152 2153 copy_4_small: 2154 CMPQ R13, $0x03 2155 JE copy_4_move_3 2156 CMPQ R13, $0x08 2157 JB copy_4_move_4through7 2158 JMP copy_4_move_8through16 2159 2160 copy_4_move_3: 2161 MOVW (R14), AX 2162 MOVB 2(R14), CL 2163 MOVW AX, (R10) 2164 MOVB CL, 2(R10) 
2165 ADDQ R13, R14 2166 ADDQ R13, R10 2167 JMP copy_4_end 2168 2169 copy_4_move_4through7: 2170 MOVL (R14), AX 2171 MOVL -4(R14)(R13*1), CX 2172 MOVL AX, (R10) 2173 MOVL CX, -4(R10)(R13*1) 2174 ADDQ R13, R14 2175 ADDQ R13, R10 2176 JMP copy_4_end 2177 2178 copy_4_move_8through16: 2179 MOVQ (R14), AX 2180 MOVQ -8(R14)(R13*1), CX 2181 MOVQ AX, (R10) 2182 MOVQ CX, -8(R10)(R13*1) 2183 ADDQ R13, R14 2184 ADDQ R13, R10 2185 2186 copy_4_end: 2187 ADDQ R13, R12 2188 JMP handle_loop 2189 JMP loop_finished 2190 2191 copy_all_from_history: 2192 MOVQ AX, R15 2193 SUBQ $0x10, R15 2194 JB copy_5_small 2195 2196 copy_5_loop: 2197 MOVUPS (R14), X0 2198 MOVUPS X0, (R10) 2199 ADDQ $0x10, R14 2200 ADDQ $0x10, R10 2201 SUBQ $0x10, R15 2202 JAE copy_5_loop 2203 LEAQ 16(R14)(R15*1), R14 2204 LEAQ 16(R10)(R15*1), R10 2205 MOVUPS -16(R14), X0 2206 MOVUPS X0, -16(R10) 2207 JMP copy_5_end 2208 2209 copy_5_small: 2210 CMPQ AX, $0x03 2211 JE copy_5_move_3 2212 JB copy_5_move_1or2 2213 CMPQ AX, $0x08 2214 JB copy_5_move_4through7 2215 JMP copy_5_move_8through16 2216 2217 copy_5_move_1or2: 2218 MOVB (R14), R15 2219 MOVB -1(R14)(AX*1), BP 2220 MOVB R15, (R10) 2221 MOVB BP, -1(R10)(AX*1) 2222 ADDQ AX, R14 2223 ADDQ AX, R10 2224 JMP copy_5_end 2225 2226 copy_5_move_3: 2227 MOVW (R14), R15 2228 MOVB 2(R14), BP 2229 MOVW R15, (R10) 2230 MOVB BP, 2(R10) 2231 ADDQ AX, R14 2232 ADDQ AX, R10 2233 JMP copy_5_end 2234 2235 copy_5_move_4through7: 2236 MOVL (R14), R15 2237 MOVL -4(R14)(AX*1), BP 2238 MOVL R15, (R10) 2239 MOVL BP, -4(R10)(AX*1) 2240 ADDQ AX, R14 2241 ADDQ AX, R10 2242 JMP copy_5_end 2243 2244 copy_5_move_8through16: 2245 MOVQ (R14), R15 2246 MOVQ -8(R14)(AX*1), BP 2247 MOVQ R15, (R10) 2248 MOVQ BP, -8(R10)(AX*1) 2249 ADDQ AX, R14 2250 ADDQ AX, R10 2251 2252 copy_5_end: 2253 ADDQ AX, R12 2254 SUBQ AX, R13 2255 2256 // Copy match from the current buffer 2257 copy_match: 2258 MOVQ R10, AX 2259 SUBQ CX, AX 2260 2261 // ml <= mo 2262 CMPQ R13, CX 2263 JA copy_overlapping_match 2264 2265 // Copy 
non-overlapping match 2266 ADDQ R13, R12 2267 MOVQ R10, CX 2268 ADDQ R13, R10 2269 2270 copy_2: 2271 MOVUPS (AX), X0 2272 MOVUPS X0, (CX) 2273 ADDQ $0x10, AX 2274 ADDQ $0x10, CX 2275 SUBQ $0x10, R13 2276 JHI copy_2 2277 JMP handle_loop 2278 2279 // Copy overlapping match 2280 copy_overlapping_match: 2281 ADDQ R13, R12 2282 2283 copy_slow_3: 2284 MOVB (AX), CL 2285 MOVB CL, (R10) 2286 INCQ AX 2287 INCQ R10 2288 DECQ R13 2289 JNZ copy_slow_3 2290 2291 handle_loop: 2292 MOVQ ctx+16(FP), AX 2293 DECQ 96(AX) 2294 JNS sequenceDecs_decodeSync_amd64_main_loop 2295 2296 loop_finished: 2297 MOVQ br+8(FP), AX 2298 MOVQ DX, 32(AX) 2299 MOVB BL, 40(AX) 2300 MOVQ SI, 24(AX) 2301 2302 // Update the context 2303 MOVQ ctx+16(FP), AX 2304 MOVQ R12, 136(AX) 2305 MOVQ 144(AX), CX 2306 SUBQ CX, R11 2307 MOVQ R11, 168(AX) 2308 2309 // Return success 2310 MOVQ $0x00000000, ret+24(FP) 2311 RET 2312 2313 // Return with match length error 2314 sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch: 2315 MOVQ 16(SP), AX 2316 MOVQ ctx+16(FP), CX 2317 MOVQ AX, 216(CX) 2318 MOVQ $0x00000001, ret+24(FP) 2319 RET 2320 2321 // Return with match too long error 2322 sequenceDecs_decodeSync_amd64_error_match_len_too_big: 2323 MOVQ ctx+16(FP), AX 2324 MOVQ 16(SP), CX 2325 MOVQ CX, 216(AX) 2326 MOVQ $0x00000002, ret+24(FP) 2327 RET 2328 2329 // Return with match offset too long error 2330 error_match_off_too_big: 2331 MOVQ ctx+16(FP), AX 2332 MOVQ 8(SP), CX 2333 MOVQ CX, 224(AX) 2334 MOVQ R12, 136(AX) 2335 MOVQ $0x00000003, ret+24(FP) 2336 RET 2337 2338 // Return with not enough literals error 2339 error_not_enough_literals: 2340 MOVQ ctx+16(FP), AX 2341 MOVQ 24(SP), CX 2342 MOVQ CX, 208(AX) 2343 MOVQ $0x00000004, ret+24(FP) 2344 RET 2345 2346 // Return with overread error 2347 error_overread: 2348 MOVQ $0x00000006, ret+24(FP) 2349 RET 2350 2351 // Return with not enough output space error 2352 error_not_enough_space: 2353 MOVQ ctx+16(FP), AX 2354 MOVQ 24(SP), CX 2355 MOVQ CX, 208(AX) 2356 MOVQ 
16(SP), CX 2357 MOVQ CX, 216(AX) 2358 MOVQ R12, 136(AX) 2359 MOVQ $0x00000005, ret+24(FP) 2360 RET 2361 2362 // func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int 2363 // Requires: BMI, BMI2, CMOV, SSE 2364 TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32 2365 MOVQ br+8(FP), CX 2366 MOVQ 32(CX), AX 2367 MOVBQZX 40(CX), DX 2368 MOVQ 24(CX), BX 2369 MOVQ (CX), CX 2370 ADDQ BX, CX 2371 MOVQ CX, (SP) 2372 MOVQ ctx+16(FP), CX 2373 MOVQ 72(CX), SI 2374 MOVQ 80(CX), DI 2375 MOVQ 88(CX), R8 2376 XORQ R9, R9 2377 MOVQ R9, 8(SP) 2378 MOVQ R9, 16(SP) 2379 MOVQ R9, 24(SP) 2380 MOVQ 112(CX), R9 2381 MOVQ 128(CX), R10 2382 MOVQ R10, 32(SP) 2383 MOVQ 144(CX), R10 2384 MOVQ 136(CX), R11 2385 MOVQ 200(CX), R12 2386 MOVQ R12, 56(SP) 2387 MOVQ 176(CX), R12 2388 MOVQ R12, 48(SP) 2389 MOVQ 184(CX), CX 2390 MOVQ CX, 40(SP) 2391 MOVQ 40(SP), CX 2392 ADDQ CX, 48(SP) 2393 2394 // Calculate pointer to s.out[cap(s.out)] (a past-end pointer) 2395 ADDQ R9, 32(SP) 2396 2397 // outBase += outPosition 2398 ADDQ R11, R9 2399 2400 sequenceDecs_decodeSync_bmi2_main_loop: 2401 MOVQ (SP), R12 2402 2403 // Fill bitreader to have enough for the offset and match length. 
2404 CMPQ BX, $0x08 2405 JL sequenceDecs_decodeSync_bmi2_fill_byte_by_byte 2406 MOVQ DX, CX 2407 SHRQ $0x03, CX 2408 SUBQ CX, R12 2409 MOVQ (R12), AX 2410 SUBQ CX, BX 2411 ANDQ $0x07, DX 2412 JMP sequenceDecs_decodeSync_bmi2_fill_end 2413 2414 sequenceDecs_decodeSync_bmi2_fill_byte_by_byte: 2415 CMPQ BX, $0x00 2416 JLE sequenceDecs_decodeSync_bmi2_fill_check_overread 2417 CMPQ DX, $0x07 2418 JLE sequenceDecs_decodeSync_bmi2_fill_end 2419 SHLQ $0x08, AX 2420 SUBQ $0x01, R12 2421 SUBQ $0x01, BX 2422 SUBQ $0x08, DX 2423 MOVBQZX (R12), CX 2424 ORQ CX, AX 2425 JMP sequenceDecs_decodeSync_bmi2_fill_byte_by_byte 2426 2427 sequenceDecs_decodeSync_bmi2_fill_check_overread: 2428 CMPQ DX, $0x40 2429 JA error_overread 2430 2431 sequenceDecs_decodeSync_bmi2_fill_end: 2432 // Update offset 2433 MOVQ $0x00000808, CX 2434 BEXTRQ CX, R8, R13 2435 MOVQ AX, R14 2436 LEAQ (DX)(R13*1), CX 2437 ROLQ CL, R14 2438 BZHIQ R13, R14, R14 2439 MOVQ CX, DX 2440 MOVQ R8, CX 2441 SHRQ $0x20, CX 2442 ADDQ R14, CX 2443 MOVQ CX, 8(SP) 2444 2445 // Update match length 2446 MOVQ $0x00000808, CX 2447 BEXTRQ CX, DI, R13 2448 MOVQ AX, R14 2449 LEAQ (DX)(R13*1), CX 2450 ROLQ CL, R14 2451 BZHIQ R13, R14, R14 2452 MOVQ CX, DX 2453 MOVQ DI, CX 2454 SHRQ $0x20, CX 2455 ADDQ R14, CX 2456 MOVQ CX, 16(SP) 2457 2458 // Fill bitreader to have enough for the remaining 2459 CMPQ BX, $0x08 2460 JL sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte 2461 MOVQ DX, CX 2462 SHRQ $0x03, CX 2463 SUBQ CX, R12 2464 MOVQ (R12), AX 2465 SUBQ CX, BX 2466 ANDQ $0x07, DX 2467 JMP sequenceDecs_decodeSync_bmi2_fill_2_end 2468 2469 sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte: 2470 CMPQ BX, $0x00 2471 JLE sequenceDecs_decodeSync_bmi2_fill_2_check_overread 2472 CMPQ DX, $0x07 2473 JLE sequenceDecs_decodeSync_bmi2_fill_2_end 2474 SHLQ $0x08, AX 2475 SUBQ $0x01, R12 2476 SUBQ $0x01, BX 2477 SUBQ $0x08, DX 2478 MOVBQZX (R12), CX 2479 ORQ CX, AX 2480 JMP sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte 2481 2482 
sequenceDecs_decodeSync_bmi2_fill_2_check_overread: 2483 CMPQ DX, $0x40 2484 JA error_overread 2485 2486 sequenceDecs_decodeSync_bmi2_fill_2_end: 2487 // Update literal length 2488 MOVQ $0x00000808, CX 2489 BEXTRQ CX, SI, R13 2490 MOVQ AX, R14 2491 LEAQ (DX)(R13*1), CX 2492 ROLQ CL, R14 2493 BZHIQ R13, R14, R14 2494 MOVQ CX, DX 2495 MOVQ SI, CX 2496 SHRQ $0x20, CX 2497 ADDQ R14, CX 2498 MOVQ CX, 24(SP) 2499 2500 // Fill bitreader for state updates 2501 MOVQ R12, (SP) 2502 MOVQ $0x00000808, CX 2503 BEXTRQ CX, R8, R12 2504 MOVQ ctx+16(FP), CX 2505 CMPQ 96(CX), $0x00 2506 JZ sequenceDecs_decodeSync_bmi2_skip_update 2507 LEAQ (SI)(DI*1), R13 2508 ADDQ R8, R13 2509 MOVBQZX R13, R13 2510 LEAQ (DX)(R13*1), CX 2511 MOVQ AX, R14 2512 MOVQ CX, DX 2513 ROLQ CL, R14 2514 BZHIQ R13, R14, R14 2515 2516 // Update Offset State 2517 BZHIQ R8, R14, CX 2518 SHRXQ R8, R14, R14 2519 MOVQ $0x00001010, R13 2520 BEXTRQ R13, R8, R8 2521 ADDQ CX, R8 2522 2523 // Load ctx.ofTable 2524 MOVQ ctx+16(FP), CX 2525 MOVQ 48(CX), CX 2526 MOVQ (CX)(R8*8), R8 2527 2528 // Update Match Length State 2529 BZHIQ DI, R14, CX 2530 SHRXQ DI, R14, R14 2531 MOVQ $0x00001010, R13 2532 BEXTRQ R13, DI, DI 2533 ADDQ CX, DI 2534 2535 // Load ctx.mlTable 2536 MOVQ ctx+16(FP), CX 2537 MOVQ 24(CX), CX 2538 MOVQ (CX)(DI*8), DI 2539 2540 // Update Literal Length State 2541 BZHIQ SI, R14, CX 2542 MOVQ $0x00001010, R13 2543 BEXTRQ R13, SI, SI 2544 ADDQ CX, SI 2545 2546 // Load ctx.llTable 2547 MOVQ ctx+16(FP), CX 2548 MOVQ (CX), CX 2549 MOVQ (CX)(SI*8), SI 2550 2551 sequenceDecs_decodeSync_bmi2_skip_update: 2552 // Adjust offset 2553 MOVQ s+0(FP), CX 2554 MOVQ 8(SP), R13 2555 CMPQ R12, $0x01 2556 JBE sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0 2557 MOVUPS 144(CX), X0 2558 MOVQ R13, 144(CX) 2559 MOVUPS X0, 152(CX) 2560 JMP sequenceDecs_decodeSync_bmi2_after_adjust 2561 2562 sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0: 2563 CMPQ 24(SP), $0x00000000 2564 JNE sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero 
2565 INCQ R13 2566 JMP sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero 2567 2568 sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero: 2569 TESTQ R13, R13 2570 JNZ sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero 2571 MOVQ 144(CX), R13 2572 JMP sequenceDecs_decodeSync_bmi2_after_adjust 2573 2574 sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero: 2575 MOVQ R13, R12 2576 XORQ R14, R14 2577 MOVQ $-1, R15 2578 CMPQ R13, $0x03 2579 CMOVQEQ R14, R12 2580 CMOVQEQ R15, R14 2581 ADDQ 144(CX)(R12*8), R14 2582 JNZ sequenceDecs_decodeSync_bmi2_adjust_temp_valid 2583 MOVQ $0x00000001, R14 2584 2585 sequenceDecs_decodeSync_bmi2_adjust_temp_valid: 2586 CMPQ R13, $0x01 2587 JZ sequenceDecs_decodeSync_bmi2_adjust_skip 2588 MOVQ 152(CX), R12 2589 MOVQ R12, 160(CX) 2590 2591 sequenceDecs_decodeSync_bmi2_adjust_skip: 2592 MOVQ 144(CX), R12 2593 MOVQ R12, 152(CX) 2594 MOVQ R14, 144(CX) 2595 MOVQ R14, R13 2596 2597 sequenceDecs_decodeSync_bmi2_after_adjust: 2598 MOVQ R13, 8(SP) 2599 2600 // Check values 2601 MOVQ 16(SP), CX 2602 MOVQ 24(SP), R12 2603 LEAQ (CX)(R12*1), R14 2604 MOVQ s+0(FP), R15 2605 ADDQ R14, 256(R15) 2606 MOVQ ctx+16(FP), R14 2607 SUBQ R12, 104(R14) 2608 JS error_not_enough_literals 2609 CMPQ CX, $0x00020002 2610 JA sequenceDecs_decodeSync_bmi2_error_match_len_too_big 2611 TESTQ R13, R13 2612 JNZ sequenceDecs_decodeSync_bmi2_match_len_ofs_ok 2613 TESTQ CX, CX 2614 JNZ sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch 2615 2616 sequenceDecs_decodeSync_bmi2_match_len_ofs_ok: 2617 MOVQ 24(SP), CX 2618 MOVQ 8(SP), R12 2619 MOVQ 16(SP), R13 2620 2621 // Check if we have enough space in s.out 2622 LEAQ (CX)(R13*1), R14 2623 ADDQ R9, R14 2624 CMPQ R14, 32(SP) 2625 JA error_not_enough_space 2626 2627 // Copy literals 2628 TESTQ CX, CX 2629 JZ check_offset 2630 XORQ R14, R14 2631 2632 copy_1: 2633 MOVUPS (R10)(R14*1), X0 2634 MOVUPS X0, (R9)(R14*1) 2635 ADDQ $0x10, R14 2636 CMPQ R14, CX 2637 JB copy_1 2638 ADDQ CX, R10 2639 ADDQ CX, R9 2640 ADDQ CX, R11 2641 2642 
// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) 2643 check_offset: 2644 MOVQ R11, CX 2645 ADDQ 40(SP), CX 2646 CMPQ R12, CX 2647 JG error_match_off_too_big 2648 CMPQ R12, 56(SP) 2649 JG error_match_off_too_big 2650 2651 // Copy match from history 2652 MOVQ R12, CX 2653 SUBQ R11, CX 2654 JLS copy_match 2655 MOVQ 48(SP), R14 2656 SUBQ CX, R14 2657 CMPQ R13, CX 2658 JG copy_all_from_history 2659 MOVQ R13, CX 2660 SUBQ $0x10, CX 2661 JB copy_4_small 2662 2663 copy_4_loop: 2664 MOVUPS (R14), X0 2665 MOVUPS X0, (R9) 2666 ADDQ $0x10, R14 2667 ADDQ $0x10, R9 2668 SUBQ $0x10, CX 2669 JAE copy_4_loop 2670 LEAQ 16(R14)(CX*1), R14 2671 LEAQ 16(R9)(CX*1), R9 2672 MOVUPS -16(R14), X0 2673 MOVUPS X0, -16(R9) 2674 JMP copy_4_end 2675 2676 copy_4_small: 2677 CMPQ R13, $0x03 2678 JE copy_4_move_3 2679 CMPQ R13, $0x08 2680 JB copy_4_move_4through7 2681 JMP copy_4_move_8through16 2682 2683 copy_4_move_3: 2684 MOVW (R14), CX 2685 MOVB 2(R14), R12 2686 MOVW CX, (R9) 2687 MOVB R12, 2(R9) 2688 ADDQ R13, R14 2689 ADDQ R13, R9 2690 JMP copy_4_end 2691 2692 copy_4_move_4through7: 2693 MOVL (R14), CX 2694 MOVL -4(R14)(R13*1), R12 2695 MOVL CX, (R9) 2696 MOVL R12, -4(R9)(R13*1) 2697 ADDQ R13, R14 2698 ADDQ R13, R9 2699 JMP copy_4_end 2700 2701 copy_4_move_8through16: 2702 MOVQ (R14), CX 2703 MOVQ -8(R14)(R13*1), R12 2704 MOVQ CX, (R9) 2705 MOVQ R12, -8(R9)(R13*1) 2706 ADDQ R13, R14 2707 ADDQ R13, R9 2708 2709 copy_4_end: 2710 ADDQ R13, R11 2711 JMP handle_loop 2712 JMP loop_finished 2713 2714 copy_all_from_history: 2715 MOVQ CX, R15 2716 SUBQ $0x10, R15 2717 JB copy_5_small 2718 2719 copy_5_loop: 2720 MOVUPS (R14), X0 2721 MOVUPS X0, (R9) 2722 ADDQ $0x10, R14 2723 ADDQ $0x10, R9 2724 SUBQ $0x10, R15 2725 JAE copy_5_loop 2726 LEAQ 16(R14)(R15*1), R14 2727 LEAQ 16(R9)(R15*1), R9 2728 MOVUPS -16(R14), X0 2729 MOVUPS X0, -16(R9) 2730 JMP copy_5_end 2731 2732 copy_5_small: 2733 CMPQ CX, $0x03 2734 JE copy_5_move_3 2735 JB copy_5_move_1or2 2736 CMPQ CX, $0x08 2737 JB 
copy_5_move_4through7 2738 JMP copy_5_move_8through16 2739 2740 copy_5_move_1or2: 2741 MOVB (R14), R15 2742 MOVB -1(R14)(CX*1), BP 2743 MOVB R15, (R9) 2744 MOVB BP, -1(R9)(CX*1) 2745 ADDQ CX, R14 2746 ADDQ CX, R9 2747 JMP copy_5_end 2748 2749 copy_5_move_3: 2750 MOVW (R14), R15 2751 MOVB 2(R14), BP 2752 MOVW R15, (R9) 2753 MOVB BP, 2(R9) 2754 ADDQ CX, R14 2755 ADDQ CX, R9 2756 JMP copy_5_end 2757 2758 copy_5_move_4through7: 2759 MOVL (R14), R15 2760 MOVL -4(R14)(CX*1), BP 2761 MOVL R15, (R9) 2762 MOVL BP, -4(R9)(CX*1) 2763 ADDQ CX, R14 2764 ADDQ CX, R9 2765 JMP copy_5_end 2766 2767 copy_5_move_8through16: 2768 MOVQ (R14), R15 2769 MOVQ -8(R14)(CX*1), BP 2770 MOVQ R15, (R9) 2771 MOVQ BP, -8(R9)(CX*1) 2772 ADDQ CX, R14 2773 ADDQ CX, R9 2774 2775 copy_5_end: 2776 ADDQ CX, R11 2777 SUBQ CX, R13 2778 2779 // Copy match from the current buffer 2780 copy_match: 2781 MOVQ R9, CX 2782 SUBQ R12, CX 2783 2784 // ml <= mo 2785 CMPQ R13, R12 2786 JA copy_overlapping_match 2787 2788 // Copy non-overlapping match 2789 ADDQ R13, R11 2790 MOVQ R9, R12 2791 ADDQ R13, R9 2792 2793 copy_2: 2794 MOVUPS (CX), X0 2795 MOVUPS X0, (R12) 2796 ADDQ $0x10, CX 2797 ADDQ $0x10, R12 2798 SUBQ $0x10, R13 2799 JHI copy_2 2800 JMP handle_loop 2801 2802 // Copy overlapping match 2803 copy_overlapping_match: 2804 ADDQ R13, R11 2805 2806 copy_slow_3: 2807 MOVB (CX), R12 2808 MOVB R12, (R9) 2809 INCQ CX 2810 INCQ R9 2811 DECQ R13 2812 JNZ copy_slow_3 2813 2814 handle_loop: 2815 MOVQ ctx+16(FP), CX 2816 DECQ 96(CX) 2817 JNS sequenceDecs_decodeSync_bmi2_main_loop 2818 2819 loop_finished: 2820 MOVQ br+8(FP), CX 2821 MOVQ AX, 32(CX) 2822 MOVB DL, 40(CX) 2823 MOVQ BX, 24(CX) 2824 2825 // Update the context 2826 MOVQ ctx+16(FP), AX 2827 MOVQ R11, 136(AX) 2828 MOVQ 144(AX), CX 2829 SUBQ CX, R10 2830 MOVQ R10, 168(AX) 2831 2832 // Return success 2833 MOVQ $0x00000000, ret+24(FP) 2834 RET 2835 2836 // Return with match length error 2837 sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch: 2838 MOVQ 
16(SP), AX // operands of the MOVQ that ends the preceding line (tail of sequenceDecs_decodeSync_bmi2)
	MOVQ ctx+16(FP), CX
	MOVQ AX, 216(CX)
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
sequenceDecs_decodeSync_bmi2_error_match_len_too_big:
	MOVQ ctx+16(FP), AX
	MOVQ 16(SP), CX
	MOVQ CX, 216(AX)
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
error_match_off_too_big:
	MOVQ ctx+16(FP), AX
	MOVQ 8(SP), CX
	MOVQ CX, 224(AX)
	MOVQ R11, 136(AX)
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
error_not_enough_literals:
	MOVQ ctx+16(FP), AX
	MOVQ 24(SP), CX
	MOVQ CX, 208(AX)
	MOVQ $0x00000004, ret+24(FP)
	RET

	// Return with overread error
error_overread:
	MOVQ $0x00000006, ret+24(FP)
	RET

	// Return with not enough output space error
error_not_enough_space:
	MOVQ ctx+16(FP), AX
	MOVQ 24(SP), CX
	MOVQ CX, 208(AX)
	MOVQ 16(SP), CX
	MOVQ CX, 216(AX)
	MOVQ R11, 136(AX)
	MOVQ $0x00000005, ret+24(FP)
	RET

// func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// Requires: CMOV, SSE
//
// Decodes sequences from the bit reader and executes each one right away:
// the literals are copied to the output, then the match is copied from the
// history buffer and/or from the output produced so far. The loop repeats
// until the counter at 96(ctx) becomes negative.
//
// Returns 0 on success, or one of the error codes 1-6 set by the error exits
// at the end of the function (each exit also stores the offending value in ctx).
//
// Block copies use size-specific moves (1-2, 3, 4-7, 8-16 bytes, or a 16-byte
// loop finished by an overlapping 16-byte move that ends exactly at the last byte).
//
// Register and stack usage, as set up by the prologue below:
//   DX       64-bit bit buffer, loaded from 32(br)
//   BX       number of bits of DX already consumed, loaded from the byte at 40(br)
//   SI       number of input bytes not yet loaded, loaded from 24(br)
//   (SP)     current input pointer = first word of br + SI; it moves downwards
//   DI/R8/R9 literal-length / match-length / offset state words (72/80/88(ctx))
//   8(SP)    offset (mo) of the current sequence
//   16(SP)   match length (ml) of the current sequence
//   24(SP)   literal length (ll) of the current sequence
//   R10      output write pointer (112(ctx) + 136(ctx))
//   R11      literals read pointer (144(ctx))
//   R12      output position (136(ctx))
//   32(SP)   past-end pointer of the output buffer (112(ctx) + 128(ctx))
//   40(SP)   history length (184(ctx))
//   48(SP)   pointer to the end of the history (176(ctx) + 184(ctx))
//   56(SP)   window size (200(ctx))
// NOTE(review): the struct field names behind these offsets are not visible in
// this file; the roles above are derived from how the values are used below.
TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32
	MOVQ br+8(FP), AX
	MOVQ 32(AX), DX
	MOVBQZX 40(AX), BX
	MOVQ 24(AX), SI
	MOVQ (AX), AX
	ADDQ SI, AX
	MOVQ AX, (SP)
	MOVQ ctx+16(FP), AX
	MOVQ 72(AX), DI
	MOVQ 80(AX), R8
	MOVQ 88(AX), R9

	// Clear the mo/ml/ll slots of the current sequence.
	XORQ CX, CX
	MOVQ CX, 8(SP)
	MOVQ CX, 16(SP)
	MOVQ CX, 24(SP)
	MOVQ 112(AX), R10
	MOVQ 128(AX), CX
	MOVQ CX, 32(SP)
	MOVQ 144(AX), R11
	MOVQ 136(AX), R12
	MOVQ 200(AX), CX
	MOVQ CX, 56(SP)
	MOVQ 176(AX), CX
	MOVQ CX, 48(SP)
	MOVQ 184(AX), AX
	MOVQ AX, 40(SP)

	// 48(SP) += 40(SP): turn the history base pointer into a pointer to its end.
	MOVQ 40(SP), AX
	ADDQ AX, 48(SP)

	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
	ADDQ R10, 32(SP)

	// outBase += outPosition
	ADDQ R12, R10

sequenceDecs_decodeSync_safe_amd64_main_loop:
	MOVQ (SP), R13 // R13 = input pointer for this iteration

	// Fill bitreader to have enough for the offset and match length.
	// Fast path (at least 8 input bytes left): step the pointer back by BX/8
	// bytes, reload all 64 bits and keep only the sub-byte remainder in BX.
	CMPQ SI, $0x08
	JL sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
	MOVQ BX, AX
	SHRQ $0x03, AX
	SUBQ AX, R13
	MOVQ (R13), DX
	SUBQ AX, SI
	ANDQ $0x07, BX
	JMP sequenceDecs_decodeSync_safe_amd64_fill_end

	// Slow path: while input remains and more than 7 bits have been consumed,
	// shift one more input byte into the low end of the bit buffer.
sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte:
	CMPQ SI, $0x00
	JLE sequenceDecs_decodeSync_safe_amd64_fill_check_overread
	CMPQ BX, $0x07
	JLE sequenceDecs_decodeSync_safe_amd64_fill_end
	SHLQ $0x08, DX
	SUBQ $0x01, R13
	SUBQ $0x01, SI
	SUBQ $0x08, BX
	MOVBQZX (R13), AX
	ORQ AX, DX
	JMP sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte

	// Input exhausted: fail if more than 64 bits have been consumed.
sequenceDecs_decodeSync_safe_amd64_fill_check_overread:
	CMPQ BX, $0x40
	JA error_overread

sequenceDecs_decodeSync_safe_amd64_fill_end:
	// Update offset
	// The state word carries the extra-bit count in bits 8-15 (read via AH)
	// and the baseline value in its upper 32 bits.
	MOVQ R9, AX
	MOVQ BX, CX
	MOVQ DX, R14
	SHLQ CL, R14 // drop the bits that were already consumed
	MOVB AH, CL // CL = number of extra bits to read
	SHRQ $0x20, AX // AX = baseline
	TESTQ CX, CX
	JZ sequenceDecs_decodeSync_safe_amd64_of_update_zero
	ADDQ CX, BX
	CMPQ BX, $0x40
	JA sequenceDecs_decodeSync_safe_amd64_of_update_zero
	CMPQ CX, $0x40
	JAE sequenceDecs_decodeSync_safe_amd64_of_update_zero
	NEGQ CX // shift counts are taken modulo 64, so this shifts right by 64-n
	SHRQ CL, R14 // R14 = the top n bits of the shifted buffer
	ADDQ R14, AX

sequenceDecs_decodeSync_safe_amd64_of_update_zero:
	MOVQ AX, 8(SP) // mo

	// Update match length
	// Same extraction as for the offset, using the match-length state word in R8.
	MOVQ R8, AX
	MOVQ BX, CX
	MOVQ DX, R14
	SHLQ CL, R14
	MOVB AH, CL
	SHRQ $0x20, AX
	TESTQ CX, CX
	JZ sequenceDecs_decodeSync_safe_amd64_ml_update_zero
	ADDQ CX, BX
	CMPQ BX, $0x40
	JA sequenceDecs_decodeSync_safe_amd64_ml_update_zero
	CMPQ CX, $0x40
	JAE sequenceDecs_decodeSync_safe_amd64_ml_update_zero
	NEGQ CX
	SHRQ CL, R14
	ADDQ R14, AX

sequenceDecs_decodeSync_safe_amd64_ml_update_zero:
	MOVQ AX, 16(SP) // ml

	// Fill bitreader to have enough for the remaining
	// (same refill procedure as at the top of the loop)
	CMPQ SI, $0x08
	JL sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
	MOVQ BX, AX
	SHRQ $0x03, AX
	SUBQ AX, R13
	MOVQ (R13), DX
	SUBQ AX, SI
	ANDQ $0x07, BX
	JMP sequenceDecs_decodeSync_safe_amd64_fill_2_end

sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte:
	CMPQ SI, $0x00
	JLE sequenceDecs_decodeSync_safe_amd64_fill_2_check_overread
	CMPQ BX, $0x07
	JLE sequenceDecs_decodeSync_safe_amd64_fill_2_end
	SHLQ $0x08, DX
	SUBQ $0x01, R13
	SUBQ $0x01, SI
	SUBQ $0x08, BX
	MOVBQZX (R13), AX
	ORQ AX, DX
	JMP sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte

sequenceDecs_decodeSync_safe_amd64_fill_2_check_overread:
	CMPQ BX, $0x40
	JA error_overread

sequenceDecs_decodeSync_safe_amd64_fill_2_end:
	// Update literal length
	// Same extraction again, using the literal-length state word in DI.
	MOVQ DI, AX
	MOVQ BX, CX
	MOVQ DX, R14
	SHLQ CL, R14
	MOVB AH, CL
	SHRQ $0x20, AX
	TESTQ CX, CX
	JZ sequenceDecs_decodeSync_safe_amd64_ll_update_zero
	ADDQ CX, BX
	CMPQ BX, $0x40
	JA sequenceDecs_decodeSync_safe_amd64_ll_update_zero
	CMPQ CX, $0x40
	JAE sequenceDecs_decodeSync_safe_amd64_ll_update_zero
	NEGQ CX
	SHRQ CL, R14
	ADDQ R14, AX

sequenceDecs_decodeSync_safe_amd64_ll_update_zero:
	MOVQ AX, 24(SP) // ll

	// Fill bitreader for state updates
	MOVQ R13, (SP) // save the advanced input pointer

	// AX = bits 8-15 of the offset state word (the offset's extra-bit count).
	// It is taken before the state is replaced and is consumed by "Adjust offset".
	MOVQ R9, AX
	SHRQ $0x08, AX
	MOVBQZX AL, AX

	// The counter at 96(ctx) is zero on the final iteration (handle_loop
	// decrements it and exits once it is negative); no new states are needed then.
	MOVQ ctx+16(FP), CX
	CMPQ 96(CX), $0x00
	JZ sequenceDecs_decodeSync_safe_amd64_skip_update

	// Update Literal Length State
	// The low byte of a state word is the number of bits to read for the next
	// state; bits 16-31 are the base that the bits read are added to.
	MOVBQZX DI, R13
	SHRQ $0x10, DI
	MOVWQZX DI, DI
	LEAQ (BX)(R13*1), CX
	MOVQ DX, R14
	MOVQ CX, BX // bits consumed += number of bits read
	ROLQ CL, R14 // rotate the wanted bits into the low end
	MOVL $0x00000001, R15
	MOVB R13, CL
	SHLL CL, R15
	DECL R15 // R15 = (1 << nbBits) - 1
	ANDQ R15, R14
	ADDQ R14, DI

	// Load ctx.llTable
	MOVQ ctx+16(FP), CX
	MOVQ (CX), CX
	MOVQ (CX)(DI*8), DI // new state word = table[DI] (8-byte entries)

	// Update Match Length State
	MOVBQZX R8, R13
	SHRQ $0x10, R8
	MOVWQZX R8, R8
	LEAQ (BX)(R13*1), CX
	MOVQ DX, R14
	MOVQ CX, BX
	ROLQ CL, R14
	MOVL $0x00000001, R15
	MOVB R13, CL
	SHLL CL, R15
	DECL R15
	ANDQ R15, R14
	ADDQ R14, R8

	// Load ctx.mlTable
	MOVQ ctx+16(FP), CX
	MOVQ 24(CX), CX
	MOVQ (CX)(R8*8), R8

	// Update Offset State
	MOVBQZX R9, R13
	SHRQ $0x10, R9
	MOVWQZX R9, R9
	LEAQ (BX)(R13*1), CX
	MOVQ DX, R14
	MOVQ CX, BX
	ROLQ CL, R14
	MOVL $0x00000001, R15
	MOVB R13, CL
	SHLL CL, R15
	DECL R15
	ANDQ R15, R14
	ADDQ R14, R9

	// Load ctx.ofTable
	MOVQ ctx+16(FP), CX
	MOVQ 48(CX), CX
	MOVQ (CX)(R9*8), R9

sequenceDecs_decodeSync_safe_amd64_skip_update:
	// Adjust offset
	// 144(s), 152(s) and 160(s) hold the three most recent offsets.
	// Extra-bit count > 1: a regular offset. Store it at 144(s) and move the
	// previous 144/152 pair down to 152/160 with one 16-byte load/store.
	MOVQ s+0(FP), CX
	MOVQ 8(SP), R13
	CMPQ AX, $0x01
	JBE sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0
	MOVUPS 144(CX), X0
	MOVQ R13, 144(CX)
	MOVUPS X0, 152(CX)
	JMP sequenceDecs_decodeSync_safe_amd64_after_adjust

	// Extra-bit count <= 1: the decoded value selects one of the recent offsets.
	// When the literal length is zero the selector is incremented by one.
sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0:
	CMPQ 24(SP), $0x00000000
	JNE sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero
	INCQ R13
	JMP sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero

	// Selector 0 (with literals present): reuse 144(s) and leave the list unchanged.
sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero:
	TESTQ R13, R13
	JNZ sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
	MOVQ 144(CX), R13
	JMP sequenceDecs_decodeSync_safe_amd64_after_adjust

	// Selector 1 or 2: R14 = recent offset at that index.
	// Selector 3:      R14 = 144(s) - 1 (index forced to 0, addend forced to -1).
	// A zero result is replaced by 1.
sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero:
	MOVQ R13, AX
	XORQ R14, R14
	MOVQ $-1, R15
	CMPQ R13, $0x03
	CMOVQEQ R14, AX
	CMOVQEQ R15, R14
	ADDQ 144(CX)(AX*8), R14
	JNZ sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid
	MOVQ $0x00000001, R14

	// Unless the selector is 1, the second recent offset moves to the third slot.
sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid:
	CMPQ R13, $0x01
	JZ sequenceDecs_decodeSync_safe_amd64_adjust_skip
	MOVQ 152(CX), AX
	MOVQ AX, 160(CX)

	// The first recent offset moves to the second slot; the new offset becomes first.
sequenceDecs_decodeSync_safe_amd64_adjust_skip:
	MOVQ 144(CX), AX
	MOVQ AX, 152(CX)
	MOVQ R14, 144(CX)
	MOVQ R14, R13

sequenceDecs_decodeSync_safe_amd64_after_adjust:
	MOVQ R13, 8(SP) // final mo

	// Check values
	MOVQ 16(SP), AX
	MOVQ 24(SP), CX
	LEAQ (AX)(CX*1), R14
	MOVQ s+0(FP), R15
	ADDQ R14, 256(R15) // 256(s) += ml + ll (presumably a running output-size total; confirm against sequenceDecs)
	MOVQ ctx+16(FP), R14
	SUBQ CX, 104(R14) // 104(ctx) -= ll; a negative result means ll exceeds the literals left
	JS error_not_enough_literals
	CMPQ AX, $0x00020002 // ml above 131074 is rejected
	JA sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big
	TESTQ R13, R13
	JNZ sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok
	TESTQ AX, AX // mo == 0 is only accepted together with ml == 0
	JNZ sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch

sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok:
	// From here on: AX = ll, CX = mo, R13 = ml.
	MOVQ 24(SP), AX
	MOVQ 8(SP), CX
	MOVQ 16(SP), R13

	// Check if we have enough space in s.out
	LEAQ (AX)(R13*1), R14
	ADDQ R10, R14
	CMPQ R14, 32(SP)
	JA error_not_enough_space

	// Copy literals
	TESTQ AX, AX
	JZ check_offset
	MOVQ AX, R14
	SUBQ $0x10, R14
	JB copy_1_small

	// ll >= 16: copy 16-byte chunks, then one final 16-byte move that ends
	// exactly at the last byte (it may overlap the preceding chunk).
copy_1_loop:
	MOVUPS (R11), X0
	MOVUPS X0, (R10)
	ADDQ $0x10, R11
	ADDQ $0x10, R10
	SUBQ $0x10, R14
	JAE copy_1_loop
	LEAQ 16(R11)(R14*1), R11
	LEAQ 16(R10)(R14*1), R10
	MOVUPS -16(R11), X0
	MOVUPS X0, -16(R10)
	JMP copy_1_end

	// ll < 16: dispatch on the length. Each case loads the first and the last
	// piece before storing either, so the two pieces may overlap.
copy_1_small:
	CMPQ AX, $0x03
	JE copy_1_move_3
	JB copy_1_move_1or2
	CMPQ AX, $0x08
	JB copy_1_move_4through7
	JMP copy_1_move_8through16

copy_1_move_1or2:
	MOVB (R11), R14
	MOVB -1(R11)(AX*1), R15
	MOVB R14, (R10)
	MOVB R15, -1(R10)(AX*1)
	ADDQ AX, R11
	ADDQ AX, R10
	JMP copy_1_end

copy_1_move_3:
	MOVW (R11), R14
	MOVB 2(R11), R15
	MOVW R14, (R10)
	MOVB R15, 2(R10)
	ADDQ AX, R11
	ADDQ AX, R10
	JMP copy_1_end

copy_1_move_4through7:
	MOVL (R11), R14
	MOVL -4(R11)(AX*1), R15
	MOVL R14, (R10)
	MOVL R15, -4(R10)(AX*1)
	ADDQ AX, R11
	ADDQ AX, R10
	JMP copy_1_end

copy_1_move_8through16:
	MOVQ (R11), R14
	MOVQ -8(R11)(AX*1), R15
	MOVQ R14, (R10)
	MOVQ R15, -8(R10)(AX*1)
	ADDQ AX, R11
	ADDQ AX, R10

copy_1_end:
	ADDQ AX, R12 // output position += ll

	// Malformed input if (seq.mo > t+len(hist) || seq.mo > s.windowSize)
check_offset:
	MOVQ R12, AX
	ADDQ 40(SP), AX
	CMPQ CX, AX
	JG error_match_off_too_big
	CMPQ CX, 56(SP)
	JG error_match_off_too_big

	// Copy match from history
	// AX = mo - output position. If that is <= 0 the match lies entirely in
	// the current output; otherwise it starts AX bytes before the history end.
	MOVQ CX, AX
	SUBQ R12, AX
	JLS copy_match
	MOVQ 48(SP), R14
	SUBQ AX, R14 // R14 = source pointer inside the history
	CMPQ R13, AX
	JG copy_all_from_history // ml > AX: the match continues into the output
	MOVQ R13, AX
	SUBQ $0x10, AX
	JB copy_4_small

	// The whole match (ml bytes) comes from the history.
copy_4_loop:
	MOVUPS (R14), X0
	MOVUPS X0, (R10)
	ADDQ $0x10, R14
	ADDQ $0x10, R10
	SUBQ $0x10, AX
	JAE copy_4_loop
	LEAQ 16(R14)(AX*1), R14
	LEAQ 16(R10)(AX*1), R10
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(R10)
	JMP copy_4_end

	// NOTE(review): unlike copy_1/copy_2/copy_5 there is no 1-2 byte case here;
	// presumably ml >= 3 is guaranteed on this path - confirm with the generator.
copy_4_small:
	CMPQ R13, $0x03
	JE copy_4_move_3
	CMPQ R13, $0x08
	JB copy_4_move_4through7
	JMP copy_4_move_8through16

	// AX and CX are used as scratch below; mo (CX) is not read again in this iteration.
copy_4_move_3:
	MOVW (R14), AX
	MOVB 2(R14), CL
	MOVW AX, (R10)
	MOVB CL, 2(R10)
	ADDQ R13, R14
	ADDQ R13, R10
	JMP copy_4_end

copy_4_move_4through7:
	MOVL (R14), AX
	MOVL -4(R14)(R13*1), CX
	MOVL AX, (R10)
	MOVL CX, -4(R10)(R13*1)
	ADDQ R13, R14
	ADDQ R13, R10
	JMP copy_4_end

copy_4_move_8through16:
	MOVQ (R14), AX
	MOVQ -8(R14)(R13*1), CX
	MOVQ AX, (R10)
	MOVQ CX, -8(R10)(R13*1)
	ADDQ R13, R14
	ADDQ R13, R10

copy_4_end:
	ADDQ R13, R12 // output position += ml
	JMP handle_loop
	JMP loop_finished // unreachable: directly follows an unconditional JMP

	// The match starts in the history and continues in the output:
	// copy the AX bytes that come from the history first.
copy_all_from_history:
	MOVQ AX, R15
	SUBQ $0x10, R15
	JB copy_5_small

copy_5_loop:
	MOVUPS (R14), X0
	MOVUPS X0, (R10)
	ADDQ $0x10, R14
	ADDQ $0x10, R10
	SUBQ $0x10, R15
	JAE copy_5_loop
	LEAQ 16(R14)(R15*1), R14
	LEAQ 16(R10)(R15*1), R10
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(R10)
	JMP copy_5_end

copy_5_small:
	CMPQ AX, $0x03
	JE copy_5_move_3
	JB copy_5_move_1or2
	CMPQ AX, $0x08
	JB copy_5_move_4through7
	JMP copy_5_move_8through16

	// R15 and BP are the scratch registers here because AX (length) and CX (mo)
	// are both still needed after the copy.
copy_5_move_1or2:
	MOVB (R14), R15
	MOVB -1(R14)(AX*1), BP
	MOVB R15, (R10)
	MOVB BP, -1(R10)(AX*1)
	ADDQ AX, R14
	ADDQ AX, R10
	JMP copy_5_end

copy_5_move_3:
	MOVW (R14), R15
	MOVB 2(R14), BP
	MOVW R15, (R10)
	MOVB BP, 2(R10)
	ADDQ AX, R14
	ADDQ AX, R10
	JMP copy_5_end

copy_5_move_4through7:
	MOVL (R14), R15
	MOVL -4(R14)(AX*1), BP
	MOVL R15, (R10)
	MOVL BP, -4(R10)(AX*1)
	ADDQ AX, R14
	ADDQ AX, R10
	JMP copy_5_end

copy_5_move_8through16:
	MOVQ (R14), R15
	MOVQ -8(R14)(AX*1), BP
	MOVQ R15, (R10)
	MOVQ BP, -8(R10)(AX*1)
	ADDQ AX, R14
	ADDQ AX, R10

copy_5_end:
	ADDQ AX, R12 // output position += bytes taken from the history
	SUBQ AX, R13 // ml -= bytes taken from the history; the rest is copied below

	// Copy match from the current buffer
copy_match:
	MOVQ R10, AX
	SUBQ CX, AX // AX = source pointer = write pointer - mo

	// ml <= mo
	CMPQ R13, CX
	JA copy_overlapping_match

	// Copy non-overlapping match
	ADDQ R13, R12
	MOVQ R13, CX
	SUBQ $0x10, CX
	JB copy_2_small

copy_2_loop:
	MOVUPS (AX), X0
	MOVUPS X0, (R10)
	ADDQ $0x10, AX
	ADDQ $0x10, R10
	SUBQ $0x10, CX
	JAE copy_2_loop
	LEAQ 16(AX)(CX*1), AX
	LEAQ 16(R10)(CX*1), R10
	MOVUPS -16(AX), X0
	MOVUPS X0, -16(R10)
	JMP copy_2_end

copy_2_small:
	CMPQ R13, $0x03
	JE copy_2_move_3
	JB copy_2_move_1or2
	CMPQ R13, $0x08
	JB copy_2_move_4through7
	JMP copy_2_move_8through16

copy_2_move_1or2:
	MOVB (AX), CL
	MOVB -1(AX)(R13*1), R14
	MOVB CL, (R10)
	MOVB R14, -1(R10)(R13*1)
	ADDQ R13, AX
	ADDQ R13, R10
	JMP copy_2_end

copy_2_move_3:
	MOVW (AX), CX
	MOVB 2(AX), R14
	MOVW CX, (R10)
	MOVB R14, 2(R10)
	ADDQ R13, AX
	ADDQ R13, R10
	JMP copy_2_end

copy_2_move_4through7:
	MOVL (AX), CX
	MOVL -4(AX)(R13*1), R14
	MOVL CX, (R10)
	MOVL R14, -4(R10)(R13*1)
	ADDQ R13, AX
	ADDQ R13, R10
	JMP copy_2_end

copy_2_move_8through16:
	MOVQ (AX), CX
	MOVQ -8(AX)(R13*1), R14
	MOVQ CX, (R10)
	MOVQ R14, -8(R10)(R13*1)
	ADDQ R13, AX
	ADDQ R13, R10

copy_2_end:
	JMP handle_loop

	// Copy overlapping match
	// ml > mo: source and destination overlap, so the copy goes byte by byte
	// and bytes written earlier in this loop are read again as source.
copy_overlapping_match:
	ADDQ R13, R12

copy_slow_3:
	MOVB (AX), CL
	MOVB CL, (R10)
	INCQ AX
	INCQ R10
	DECQ R13
	JNZ copy_slow_3

	// Decrement the counter at 96(ctx); continue while it is still >= 0.
handle_loop:
	MOVQ ctx+16(FP), AX
	DECQ 96(AX)
	JNS sequenceDecs_decodeSync_safe_amd64_main_loop

	// Write the bit reader state back to br.
loop_finished:
	MOVQ br+8(FP), AX
	MOVQ DX, 32(AX)
	MOVB BL, 40(AX)
	MOVQ SI, 24(AX)

	// Update the context
	// 136(ctx) = output position; 168(ctx) = literal bytes consumed
	// (literals pointer minus its start value at 144(ctx)).
	MOVQ ctx+16(FP), AX
	MOVQ R12, 136(AX)
	MOVQ 144(AX), CX
	SUBQ CX, R11
	MOVQ R11, 168(AX)

	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error
	// (code 1; ml is stored at 216(ctx))
sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch:
	MOVQ 16(SP), AX
	MOVQ ctx+16(FP), CX
	MOVQ AX, 216(CX)
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
	// (code 2; ml is stored at 216(ctx))
sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big:
	MOVQ ctx+16(FP), AX
	MOVQ 16(SP), CX
	MOVQ CX, 216(AX)
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
	// (code 3; mo is stored at 224(ctx), the output position at 136(ctx))
error_match_off_too_big:
	MOVQ ctx+16(FP), AX
	MOVQ 8(SP), CX
	MOVQ CX, 224(AX)
	MOVQ R12, 136(AX)
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
	// (code 4; ll is stored at 208(ctx))
error_not_enough_literals:
	MOVQ ctx+16(FP), AX
	MOVQ 24(SP), CX
	MOVQ CX, 208(AX)
	MOVQ $0x00000004, ret+24(FP)
	RET

	// Return with overread error
	// (code 6; nothing is written back to br or ctx)
error_overread:
	MOVQ $0x00000006, ret+24(FP)
	RET

	// Return with not enough output space error
	// (code 5; ll, ml and the output position are stored at 208/216/136(ctx))
error_not_enough_space:
	MOVQ ctx+16(FP), AX
	MOVQ 24(SP), CX
	MOVQ CX, 208(AX)
	MOVQ 16(SP), CX
	MOVQ CX, 216(AX)
	MOVQ R12, 136(AX)
	MOVQ $0x00000005, ret+24(FP)
	RET

// func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// Requires: BMI, BMI2, CMOV, SSE
//
// Same decode-and-execute loop as sequenceDecs_decodeSync_safe_amd64, with
// the bit-field extraction done by BEXTR/BZHI/SHRX. The three state updates
// take their bits from a single combined read of the bit buffer.
//
// Returns 0 on success, or one of the error codes 1-6 set by the error exits
// at the end of the function.
//
// Register and stack usage, as set up by the prologue below:
//   AX       64-bit bit buffer, loaded from 32(br)
//   DX       number of bits of AX already consumed, loaded from the byte at 40(br)
//   BX       number of input bytes not yet loaded, loaded from 24(br)
//   (SP)     current input pointer = first word of br + BX; it moves downwards
//   SI/DI/R8 literal-length / match-length / offset state words (72/80/88(ctx))
//   8(SP)    offset (mo) of the current sequence
//   16(SP)   match length (ml) of the current sequence
//   24(SP)   literal length (ll) of the current sequence
//   R9       output write pointer (112(ctx) + 136(ctx))
//   R10      literals read pointer (144(ctx))
//   R11      output position (136(ctx))
//   32(SP)   past-end pointer of the output buffer (112(ctx) + 128(ctx))
//   40(SP)   history length (184(ctx))
//   48(SP)   pointer to the end of the history (176(ctx) + 184(ctx))
//   56(SP)   window size (200(ctx))
// NOTE(review): the struct field names behind these offsets are not visible in
// this file; the roles above are derived from how the values are used below.
TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32
	MOVQ br+8(FP), CX
	MOVQ 32(CX), AX
	MOVBQZX 40(CX), DX
	MOVQ 24(CX), BX
	MOVQ (CX), CX
	ADDQ BX, CX
	MOVQ CX, (SP)
	MOVQ ctx+16(FP), CX
	MOVQ 72(CX), SI
	MOVQ 80(CX), DI
	MOVQ 88(CX), R8

	// Clear the mo/ml/ll slots of the current sequence.
	XORQ R9, R9
	MOVQ R9, 8(SP)
	MOVQ R9, 16(SP)
	MOVQ R9, 24(SP)
	MOVQ 112(CX), R9
	MOVQ 128(CX), R10
	MOVQ R10, 32(SP)
	MOVQ 144(CX), R10
	MOVQ 136(CX), R11
	MOVQ 200(CX), R12
	MOVQ R12, 56(SP)
	MOVQ 176(CX), R12
	MOVQ R12, 48(SP)
	MOVQ 184(CX), CX
	MOVQ CX, 40(SP)

	// 48(SP) += 40(SP): turn the history base pointer into a pointer to its end.
	MOVQ 40(SP), CX
	ADDQ CX, 48(SP)

	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
	ADDQ R9, 32(SP)

	// outBase += outPosition
	ADDQ R11, R9

sequenceDecs_decodeSync_safe_bmi2_main_loop:
	MOVQ (SP), R12 // R12 = input pointer for this iteration

	// Fill bitreader to have enough for the offset and match length.
	// Fast path (at least 8 input bytes left): step the pointer back by DX/8
	// bytes, reload all 64 bits and keep only the sub-byte remainder in DX.
	CMPQ BX, $0x08
	JL sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
	MOVQ DX, CX
	SHRQ $0x03, CX
	SUBQ CX, R12
	MOVQ (R12), AX
	SUBQ CX, BX
	ANDQ $0x07, DX
	JMP sequenceDecs_decodeSync_safe_bmi2_fill_end

	// Slow path: while input remains and more than 7 bits have been consumed,
	// shift one more input byte into the low end of the bit buffer.
sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte:
	CMPQ BX, $0x00
	JLE sequenceDecs_decodeSync_safe_bmi2_fill_check_overread
	CMPQ DX, $0x07
	JLE sequenceDecs_decodeSync_safe_bmi2_fill_end
	SHLQ $0x08, AX
	SUBQ $0x01, R12
	SUBQ $0x01, BX
	SUBQ $0x08, DX
	MOVBQZX (R12), CX
	ORQ CX, AX
	JMP sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte

	// Input exhausted: fail if more than 64 bits have been consumed.
sequenceDecs_decodeSync_safe_bmi2_fill_check_overread:
	CMPQ DX, $0x40
	JA error_overread

sequenceDecs_decodeSync_safe_bmi2_fill_end:
	// Update offset
	// BEXTR control 0x0808 = start bit 8, length 8: R13 = extra-bit count
	// stored in bits 8-15 of the state word.
	MOVQ $0x00000808, CX
	BEXTRQ CX, R8, R13
	MOVQ AX, R14
	LEAQ (DX)(R13*1), CX
	ROLQ CL, R14 // rotate the next R13 unread bits into the low end
	BZHIQ R13, R14, R14 // keep only those R13 bits
	MOVQ CX, DX // bits consumed += R13
	MOVQ R8, CX
	SHRQ $0x20, CX // baseline = upper 32 bits of the state word
	ADDQ R14, CX
	MOVQ CX, 8(SP) // mo

	// Update match length
	// Same extraction, using the match-length state word in DI.
	MOVQ $0x00000808, CX
	BEXTRQ CX, DI, R13
	MOVQ AX, R14
	LEAQ (DX)(R13*1), CX
	ROLQ CL, R14
	BZHIQ R13, R14, R14
	MOVQ CX, DX
	MOVQ DI, CX
	SHRQ $0x20, CX
	ADDQ R14, CX
	MOVQ CX, 16(SP) // ml

	// Fill bitreader to have enough for the remaining
	// (same refill procedure as at the top of the loop)
	CMPQ BX, $0x08
	JL sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
	MOVQ DX, CX
	SHRQ $0x03, CX
	SUBQ CX, R12
	MOVQ (R12), AX
	SUBQ CX, BX
	ANDQ $0x07, DX
	JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_end

sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte:
	CMPQ BX, $0x00
	JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_check_overread
	CMPQ DX, $0x07
	JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_end
	SHLQ $0x08, AX
	SUBQ $0x01, R12
	SUBQ $0x01, BX
	SUBQ $0x08, DX
	MOVBQZX (R12), CX
	ORQ CX, AX
	JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte

sequenceDecs_decodeSync_safe_bmi2_fill_2_check_overread:
	CMPQ DX, $0x40
	JA error_overread

sequenceDecs_decodeSync_safe_bmi2_fill_2_end:
	// Update literal length
	// Same extraction, using the literal-length state word in SI.
	MOVQ $0x00000808, CX
	BEXTRQ CX, SI, R13
	MOVQ AX, R14
	LEAQ (DX)(R13*1), CX
	ROLQ CL, R14
	BZHIQ R13, R14, R14
	MOVQ CX, DX
	MOVQ SI, CX
	SHRQ $0x20, CX
	ADDQ R14, CX
	MOVQ CX, 24(SP) // ll

	// Fill bitreader for state updates
	MOVQ R12, (SP) // save the advanced input pointer

	// R12 = bits 8-15 of the offset state word (the offset's extra-bit count).
	// It is taken before the state is replaced and is consumed by "Adjust offset".
	MOVQ $0x00000808, CX
	BEXTRQ CX, R8, R12

	// The counter at 96(ctx) is zero on the final iteration (handle_loop
	// decrements it and exits once it is negative); no new states are needed then.
	MOVQ ctx+16(FP), CX
	CMPQ 96(CX), $0x00
	JZ sequenceDecs_decodeSync_safe_bmi2_skip_update

	// R13 = low byte of the sum of the three state words, i.e. the combined
	// number of bits needed by the three state updates. Read them in one go
	// into R14; they are split apart below, starting from the low end.
	LEAQ (SI)(DI*1), R13
	ADDQ R8, R13
	MOVBQZX R13, R13
	LEAQ (DX)(R13*1), CX
	MOVQ AX, R14
	MOVQ CX, DX
	ROLQ CL, R14
	BZHIQ R13, R14, R14

	// Update Offset State
	// BZHI and SHRX take their count from the low byte of the state word (its
	// bit count); BEXTR control 0x1010 = start bit 16, length 16 (the base).
	BZHIQ R8, R14, CX
	SHRXQ R8, R14, R14
	MOVQ $0x00001010, R13
	BEXTRQ R13, R8, R8
	ADDQ CX, R8

	// Load ctx.ofTable
	MOVQ ctx+16(FP), CX
	MOVQ 48(CX), CX
	MOVQ (CX)(R8*8), R8 // new state word = table[R8] (8-byte entries)

	// Update Match Length State
	BZHIQ DI, R14, CX
	SHRXQ DI, R14, R14
	MOVQ $0x00001010, R13
	BEXTRQ R13, DI, DI
	ADDQ CX, DI

	// Load ctx.mlTable
	MOVQ ctx+16(FP), CX
	MOVQ 24(CX), CX
	MOVQ (CX)(DI*8), DI

	// Update Literal Length State
	// (last field of the combined read, so no further shift of R14 is needed)
	BZHIQ SI, R14, CX
	MOVQ $0x00001010, R13
	BEXTRQ R13, SI, SI
	ADDQ CX, SI

	// Load ctx.llTable
	MOVQ ctx+16(FP), CX
	MOVQ (CX), CX
	MOVQ (CX)(SI*8), SI

sequenceDecs_decodeSync_safe_bmi2_skip_update:
	// Adjust offset
	// 144(s), 152(s) and 160(s) hold the three most recent offsets.
	// Extra-bit count > 1: a regular offset. Store it at 144(s) and move the
	// previous 144/152 pair down to 152/160 with one 16-byte load/store.
	MOVQ s+0(FP), CX
	MOVQ 8(SP), R13
	CMPQ R12, $0x01
	JBE sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0
	MOVUPS 144(CX), X0
	MOVQ R13, 144(CX)
	MOVUPS X0, 152(CX)
	JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust

	// Extra-bit count <= 1: the decoded value selects one of the recent offsets.
	// When the literal length is zero the selector is incremented by one.
sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0:
	CMPQ 24(SP), $0x00000000
	JNE sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero
	INCQ R13
	JMP sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero

	// Selector 0 (with literals present): reuse 144(s) and leave the list unchanged.
sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero:
	TESTQ R13, R13
	JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
	MOVQ 144(CX), R13
	JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust

	// Selector 1 or 2: R14 = recent offset at that index.
	// Selector 3:      R14 = 144(s) - 1 (index forced to 0, addend forced to -1).
	// A zero result is replaced by 1.
sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero:
	MOVQ R13, R12
	XORQ R14, R14
	MOVQ $-1, R15
	CMPQ R13, $0x03
	CMOVQEQ R14, R12
	CMOVQEQ R15, R14
	ADDQ 144(CX)(R12*8), R14
	JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid
	MOVQ $0x00000001, R14

	// Unless the selector is 1, the second recent offset moves to the third slot.
sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid:
	CMPQ R13, $0x01
	JZ sequenceDecs_decodeSync_safe_bmi2_adjust_skip
	MOVQ 152(CX), R12
	MOVQ R12, 160(CX)

	// The first recent offset moves to the second slot; the new offset becomes first.
sequenceDecs_decodeSync_safe_bmi2_adjust_skip:
	MOVQ 144(CX), R12
	MOVQ R12, 152(CX)
	MOVQ R14, 144(CX)
	MOVQ R14, R13

sequenceDecs_decodeSync_safe_bmi2_after_adjust:
	MOVQ R13, 8(SP) // final mo

	// Check values
	MOVQ 16(SP), CX
	MOVQ 24(SP), R12
	LEAQ (CX)(R12*1), R14
	MOVQ s+0(FP), R15
	ADDQ R14, 256(R15) // 256(s) += ml + ll (presumably a running output-size total; confirm against sequenceDecs)
	MOVQ ctx+16(FP), R14
	SUBQ R12, 104(R14) // 104(ctx) -= ll; a negative result means ll exceeds the literals left
	JS error_not_enough_literals
	CMPQ CX, $0x00020002 // ml above 131074 is rejected
	JA sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big
	TESTQ R13, R13
	JNZ sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok
	TESTQ CX, CX // mo == 0 is only accepted together with ml == 0
	JNZ sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch

sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok:
	// From here on: CX = ll, R12 = mo, R13 = ml.
	MOVQ 24(SP), CX
	MOVQ 8(SP), R12
	MOVQ 16(SP), R13

	// Check if we have enough space in s.out
	LEAQ (CX)(R13*1), R14
	ADDQ R9, R14
	CMPQ R14, 32(SP)
	JA error_not_enough_space

	// Copy literals
	TESTQ CX, CX
	JZ check_offset
	MOVQ CX, R14
	SUBQ $0x10, R14
	JB copy_1_small

	// ll >= 16: copy 16-byte chunks, then one final 16-byte move that ends
	// exactly at the last byte (it may overlap the preceding chunk).
copy_1_loop:
	MOVUPS (R10), X0
	MOVUPS X0, (R9)
	ADDQ $0x10, R10
	ADDQ $0x10, R9
	SUBQ $0x10, R14
	JAE copy_1_loop
	LEAQ 16(R10)(R14*1), R10
	LEAQ 16(R9)(R14*1), R9
	MOVUPS -16(R10), X0
	MOVUPS X0, -16(R9)
	JMP copy_1_end

	// ll < 16: dispatch on the length. Each case loads the first and the last
	// piece before storing either, so the two pieces may overlap.
copy_1_small:
	CMPQ CX, $0x03
	JE copy_1_move_3
	JB copy_1_move_1or2
	CMPQ CX, $0x08
	JB copy_1_move_4through7
	JMP copy_1_move_8through16

copy_1_move_1or2:
	MOVB (R10), R14
	MOVB -1(R10)(CX*1), R15
	MOVB R14, (R9)
	MOVB R15, -1(R9)(CX*1)
	ADDQ CX, R10
	ADDQ CX, R9
	JMP copy_1_end

copy_1_move_3:
	MOVW (R10), R14
	MOVB 2(R10), R15
	MOVW R14, (R9)
	MOVB R15, 2(R9)
	ADDQ CX, R10
	ADDQ CX, R9
	JMP copy_1_end

copy_1_move_4through7:
	MOVL (R10), R14
	MOVL -4(R10)(CX*1), R15
	MOVL R14, (R9)
	MOVL R15, -4(R9)(CX*1)
	ADDQ CX, R10
	ADDQ CX, R9
	JMP copy_1_end

copy_1_move_8through16:
	MOVQ (R10), R14
	MOVQ -8(R10)(CX*1), R15
	MOVQ R14, (R9)
	MOVQ R15, -8(R9)(CX*1)
	ADDQ CX, R10
	ADDQ CX, R9

copy_1_end:
	ADDQ CX, R11 // output position += ll

	// Malformed input if (seq.mo > t+len(hist) || seq.mo > s.windowSize)
check_offset:
	MOVQ R11, CX
	ADDQ 40(SP), CX
	CMPQ R12, CX
	JG error_match_off_too_big
	CMPQ R12, 56(SP)
	JG error_match_off_too_big

	// Copy match from history
	// CX = mo - output position. If that is <= 0 the match lies entirely in
	// the current output; otherwise it starts CX bytes before the history end.
	MOVQ R12, CX
	SUBQ R11, CX
	JLS copy_match
	MOVQ 48(SP), R14
	SUBQ CX, R14 // R14 = source pointer inside the history
	CMPQ R13, CX
	JG copy_all_from_history // ml > CX: the match continues into the output
	MOVQ R13, CX
	SUBQ $0x10, CX
	JB copy_4_small

	// The whole match (ml bytes) comes from the history.
copy_4_loop:
	MOVUPS (R14), X0
	MOVUPS X0, (R9)
	ADDQ $0x10, R14
	ADDQ $0x10, R9
	SUBQ $0x10, CX
	JAE copy_4_loop
	LEAQ 16(R14)(CX*1), R14
	LEAQ 16(R9)(CX*1), R9
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(R9)
	JMP copy_4_end

	// NOTE(review): unlike copy_1/copy_2/copy_5 there is no 1-2 byte case here;
	// presumably ml >= 3 is guaranteed on this path - confirm with the generator.
copy_4_small:
	CMPQ R13, $0x03
	JE copy_4_move_3
	CMPQ R13, $0x08
	JB copy_4_move_4through7
	JMP copy_4_move_8through16

	// CX and R12 are used as scratch below; mo (R12) is not read again in this iteration.
copy_4_move_3:
	MOVW (R14), CX
	MOVB 2(R14), R12
	MOVW CX, (R9)
	MOVB R12, 2(R9)
	ADDQ R13, R14
	ADDQ R13, R9
	JMP copy_4_end

copy_4_move_4through7:
	MOVL (R14), CX
	MOVL -4(R14)(R13*1), R12
	MOVL CX, (R9)
	MOVL R12, -4(R9)(R13*1)
	ADDQ R13, R14
	ADDQ R13, R9
	JMP copy_4_end

copy_4_move_8through16:
	MOVQ (R14), CX
	MOVQ -8(R14)(R13*1), R12
	MOVQ CX, (R9)
	MOVQ R12, -8(R9)(R13*1)
	ADDQ R13, R14
	ADDQ R13, R9

copy_4_end:
	ADDQ R13, R11 // output position += ml
	JMP handle_loop
	JMP loop_finished // unreachable: directly follows an unconditional JMP

	// The match starts in the history and continues in the output:
	// copy the CX bytes that come from the history first.
copy_all_from_history:
	MOVQ CX, R15
	SUBQ $0x10, R15
	JB copy_5_small

copy_5_loop:
	MOVUPS (R14), X0
	MOVUPS X0, (R9)
	ADDQ $0x10, R14
	ADDQ $0x10, R9
	SUBQ $0x10, R15
	JAE copy_5_loop
	LEAQ 16(R14)(R15*1), R14
	LEAQ 16(R9)(R15*1), R9
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(R9)
	JMP copy_5_end

copy_5_small:
	CMPQ CX, $0x03
	JE copy_5_move_3
	JB copy_5_move_1or2
	CMPQ CX, $0x08
	JB copy_5_move_4through7
	JMP copy_5_move_8through16

	// R15 and BP are the scratch registers here because CX (length) and R12 (mo)
	// are both still needed after the copy.
copy_5_move_1or2:
	MOVB (R14), R15
	MOVB -1(R14)(CX*1), BP
	MOVB R15, (R9)
	MOVB BP, -1(R9)(CX*1)
	ADDQ CX, R14
	ADDQ CX, R9
	JMP copy_5_end

copy_5_move_3:
	MOVW (R14), R15
	MOVB 2(R14), BP
	MOVW R15, (R9)
	MOVB BP, 2(R9)
	ADDQ CX, R14
	ADDQ CX, R9
	JMP copy_5_end

copy_5_move_4through7:
	MOVL (R14), R15
	MOVL -4(R14)(CX*1), BP
	MOVL R15, (R9)
	MOVL BP, -4(R9)(CX*1)
	ADDQ CX, R14
	ADDQ CX, R9
	JMP copy_5_end

copy_5_move_8through16:
	MOVQ (R14), R15
	MOVQ -8(R14)(CX*1), BP
	MOVQ R15, (R9)
	MOVQ BP, -8(R9)(CX*1)
	ADDQ CX, R14
	ADDQ CX, R9

copy_5_end:
	ADDQ CX, R11 // output position += bytes taken from the history
	SUBQ CX, R13 // ml -= bytes taken from the history; the rest is copied below

	// Copy match from the current buffer
copy_match:
	MOVQ R9, CX
	SUBQ R12, CX // CX = source pointer = write pointer - mo

	// ml <= mo
	CMPQ R13, R12
	JA copy_overlapping_match

	// Copy non-overlapping match
	ADDQ R13, R11
	MOVQ R13, R12
	SUBQ $0x10, R12
	JB copy_2_small

copy_2_loop:
	MOVUPS (CX), X0
	MOVUPS X0, (R9)
	ADDQ $0x10, CX
	ADDQ $0x10, R9
	SUBQ $0x10, R12
	JAE copy_2_loop
	LEAQ 16(CX)(R12*1), CX
	LEAQ 16(R9)(R12*1), R9
	MOVUPS -16(CX), X0
	MOVUPS X0, -16(R9)
	JMP copy_2_end

copy_2_small:
	CMPQ R13, $0x03
	JE copy_2_move_3
	JB copy_2_move_1or2
	CMPQ R13, $0x08
	JB copy_2_move_4through7
	JMP copy_2_move_8through16

copy_2_move_1or2:
	MOVB (CX), R12
	MOVB -1(CX)(R13*1), R14
	MOVB R12, (R9)
	MOVB R14, -1(R9)(R13*1)
	ADDQ R13, CX
	ADDQ R13, R9
	JMP copy_2_end

copy_2_move_3:
	MOVW (CX), R12
	MOVB 2(CX), R14
	MOVW R12, (R9)
	MOVB R14, 2(R9)
	ADDQ R13, CX
	ADDQ R13, R9
	JMP copy_2_end

copy_2_move_4through7:
	MOVL (CX), R12
	MOVL -4(CX)(R13*1), R14
	MOVL R12, (R9)
	MOVL R14, -4(R9)(R13*1)
	ADDQ R13, CX
	ADDQ R13, R9
	JMP copy_2_end

copy_2_move_8through16:
	MOVQ (CX), R12
	MOVQ -8(CX)(R13*1), R14
	MOVQ R12, (R9)
	MOVQ R14, -8(R9)(R13*1)
	ADDQ R13, CX
	ADDQ R13, R9

copy_2_end:
	JMP handle_loop

	// Copy overlapping match
	// ml > mo: source and destination overlap, so the copy goes byte by byte
	// and bytes written earlier in this loop are read again as source.
copy_overlapping_match:
	ADDQ R13, R11

copy_slow_3:
	MOVB (CX), R12
	MOVB R12, (R9)
	INCQ CX
	INCQ R9
	DECQ R13
	JNZ copy_slow_3

	// Decrement the counter at 96(ctx); continue while it is still >= 0.
handle_loop:
	MOVQ ctx+16(FP), CX
	DECQ 96(CX)
	JNS sequenceDecs_decodeSync_safe_bmi2_main_loop

	// Write the bit reader state back to br.
loop_finished:
	MOVQ br+8(FP), CX
	MOVQ AX, 32(CX)
	MOVB DL, 40(CX)
	MOVQ BX, 24(CX)

	// Update the context
	// 136(ctx) = output position; 168(ctx) = literal bytes consumed
	// (literals pointer minus its start value at 144(ctx)).
	MOVQ ctx+16(FP), AX
	MOVQ R11, 136(AX)
	MOVQ 144(AX), CX
	SUBQ CX, R10
	MOVQ R10, 168(AX)

	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error
	// (code 1; ml is stored at 216(ctx))
sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch:
	MOVQ 16(SP), AX
	MOVQ ctx+16(FP), CX
	MOVQ AX, 216(CX)
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
	// (code 2; ml is stored at 216(ctx))
sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big:
	MOVQ ctx+16(FP), AX
	MOVQ 16(SP), CX
	MOVQ CX, 216(AX)
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
	// (code 3; mo is stored at 224(ctx), the output position at 136(ctx))
error_match_off_too_big:
	MOVQ ctx+16(FP), AX
	MOVQ 8(SP), CX
	MOVQ CX, 224(AX)
	MOVQ R11, 136(AX)
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
	// (code 4; ll is stored at 208(ctx))
error_not_enough_literals:
	MOVQ ctx+16(FP), AX
	MOVQ 24(SP), CX
	MOVQ CX, 208(AX)
	MOVQ $0x00000004, ret+24(FP)
	RET

	// Return with overread error
	// (code 6; nothing is written back to br or ctx)
error_overread:
	MOVQ $0x00000006, ret+24(FP)
	RET

	// Return with not enough output space error
	// (code 5; ll, ml and the output position are stored at 208/216/136(ctx))
error_not_enough_space:
	MOVQ ctx+16(FP), AX
	MOVQ 24(SP), CX
	MOVQ CX, 208(AX)
	MOVQ 16(SP), CX
	MOVQ CX, 216(AX)
	MOVQ R11, 136(AX)
	MOVQ $0x00000005, ret+24(FP)
	RET