github.com/mdaxf/iac@v0.0.0-20240519030858-58a061660378/vendor_skip/github.com/klauspost/compress/zstd/seqdec_amd64.s (about) 1 // Code generated by command: go run gen.go -out ../seqdec_amd64.s -pkg=zstd. DO NOT EDIT. 2 3 //go:build !appengine && !noasm && gc && !noasm 4 5 // func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int 6 // Requires: CMOV 7 TEXT ·sequenceDecs_decode_amd64(SB), $8-32 8 MOVQ br+8(FP), CX 9 MOVQ 24(CX), DX 10 MOVBQZX 32(CX), BX 11 MOVQ (CX), AX 12 MOVQ 8(CX), SI 13 ADDQ SI, AX 14 MOVQ AX, (SP) 15 MOVQ ctx+16(FP), AX 16 MOVQ 72(AX), DI 17 MOVQ 80(AX), R8 18 MOVQ 88(AX), R9 19 MOVQ 104(AX), R10 20 MOVQ s+0(FP), AX 21 MOVQ 144(AX), R11 22 MOVQ 152(AX), R12 23 MOVQ 160(AX), R13 24 25 sequenceDecs_decode_amd64_main_loop: 26 MOVQ (SP), R14 27 28 // Fill bitreader to have enough for the offset and match length. 29 CMPQ SI, $0x08 30 JL sequenceDecs_decode_amd64_fill_byte_by_byte 31 MOVQ BX, AX 32 SHRQ $0x03, AX 33 SUBQ AX, R14 34 MOVQ (R14), DX 35 SUBQ AX, SI 36 ANDQ $0x07, BX 37 JMP sequenceDecs_decode_amd64_fill_end 38 39 sequenceDecs_decode_amd64_fill_byte_by_byte: 40 CMPQ SI, $0x00 41 JLE sequenceDecs_decode_amd64_fill_check_overread 42 CMPQ BX, $0x07 43 JLE sequenceDecs_decode_amd64_fill_end 44 SHLQ $0x08, DX 45 SUBQ $0x01, R14 46 SUBQ $0x01, SI 47 SUBQ $0x08, BX 48 MOVBQZX (R14), AX 49 ORQ AX, DX 50 JMP sequenceDecs_decode_amd64_fill_byte_by_byte 51 52 sequenceDecs_decode_amd64_fill_check_overread: 53 CMPQ BX, $0x40 54 JA error_overread 55 56 sequenceDecs_decode_amd64_fill_end: 57 // Update offset 58 MOVQ R9, AX 59 MOVQ BX, CX 60 MOVQ DX, R15 61 SHLQ CL, R15 62 MOVB AH, CL 63 SHRQ $0x20, AX 64 TESTQ CX, CX 65 JZ sequenceDecs_decode_amd64_of_update_zero 66 ADDQ CX, BX 67 CMPQ BX, $0x40 68 JA sequenceDecs_decode_amd64_of_update_zero 69 CMPQ CX, $0x40 70 JAE sequenceDecs_decode_amd64_of_update_zero 71 NEGQ CX 72 SHRQ CL, R15 73 ADDQ R15, AX 74 75 sequenceDecs_decode_amd64_of_update_zero: 76 MOVQ AX, 16(R10) 77 
78 // Update match length 79 MOVQ R8, AX 80 MOVQ BX, CX 81 MOVQ DX, R15 82 SHLQ CL, R15 83 MOVB AH, CL 84 SHRQ $0x20, AX 85 TESTQ CX, CX 86 JZ sequenceDecs_decode_amd64_ml_update_zero 87 ADDQ CX, BX 88 CMPQ BX, $0x40 89 JA sequenceDecs_decode_amd64_ml_update_zero 90 CMPQ CX, $0x40 91 JAE sequenceDecs_decode_amd64_ml_update_zero 92 NEGQ CX 93 SHRQ CL, R15 94 ADDQ R15, AX 95 96 sequenceDecs_decode_amd64_ml_update_zero: 97 MOVQ AX, 8(R10) 98 99 // Fill bitreader to have enough for the remaining 100 CMPQ SI, $0x08 101 JL sequenceDecs_decode_amd64_fill_2_byte_by_byte 102 MOVQ BX, AX 103 SHRQ $0x03, AX 104 SUBQ AX, R14 105 MOVQ (R14), DX 106 SUBQ AX, SI 107 ANDQ $0x07, BX 108 JMP sequenceDecs_decode_amd64_fill_2_end 109 110 sequenceDecs_decode_amd64_fill_2_byte_by_byte: 111 CMPQ SI, $0x00 112 JLE sequenceDecs_decode_amd64_fill_2_check_overread 113 CMPQ BX, $0x07 114 JLE sequenceDecs_decode_amd64_fill_2_end 115 SHLQ $0x08, DX 116 SUBQ $0x01, R14 117 SUBQ $0x01, SI 118 SUBQ $0x08, BX 119 MOVBQZX (R14), AX 120 ORQ AX, DX 121 JMP sequenceDecs_decode_amd64_fill_2_byte_by_byte 122 123 sequenceDecs_decode_amd64_fill_2_check_overread: 124 CMPQ BX, $0x40 125 JA error_overread 126 127 sequenceDecs_decode_amd64_fill_2_end: 128 // Update literal length 129 MOVQ DI, AX 130 MOVQ BX, CX 131 MOVQ DX, R15 132 SHLQ CL, R15 133 MOVB AH, CL 134 SHRQ $0x20, AX 135 TESTQ CX, CX 136 JZ sequenceDecs_decode_amd64_ll_update_zero 137 ADDQ CX, BX 138 CMPQ BX, $0x40 139 JA sequenceDecs_decode_amd64_ll_update_zero 140 CMPQ CX, $0x40 141 JAE sequenceDecs_decode_amd64_ll_update_zero 142 NEGQ CX 143 SHRQ CL, R15 144 ADDQ R15, AX 145 146 sequenceDecs_decode_amd64_ll_update_zero: 147 MOVQ AX, (R10) 148 149 // Fill bitreader for state updates 150 MOVQ R14, (SP) 151 MOVQ R9, AX 152 SHRQ $0x08, AX 153 MOVBQZX AL, AX 154 MOVQ ctx+16(FP), CX 155 CMPQ 96(CX), $0x00 156 JZ sequenceDecs_decode_amd64_skip_update 157 158 // Update Literal Length State 159 MOVBQZX DI, R14 160 SHRL $0x10, DI 161 LEAQ (BX)(R14*1), CX 
162 MOVQ DX, R15 163 MOVQ CX, BX 164 ROLQ CL, R15 165 MOVL $0x00000001, BP 166 MOVB R14, CL 167 SHLL CL, BP 168 DECL BP 169 ANDQ BP, R15 170 ADDQ R15, DI 171 172 // Load ctx.llTable 173 MOVQ ctx+16(FP), CX 174 MOVQ (CX), CX 175 MOVQ (CX)(DI*8), DI 176 177 // Update Match Length State 178 MOVBQZX R8, R14 179 SHRL $0x10, R8 180 LEAQ (BX)(R14*1), CX 181 MOVQ DX, R15 182 MOVQ CX, BX 183 ROLQ CL, R15 184 MOVL $0x00000001, BP 185 MOVB R14, CL 186 SHLL CL, BP 187 DECL BP 188 ANDQ BP, R15 189 ADDQ R15, R8 190 191 // Load ctx.mlTable 192 MOVQ ctx+16(FP), CX 193 MOVQ 24(CX), CX 194 MOVQ (CX)(R8*8), R8 195 196 // Update Offset State 197 MOVBQZX R9, R14 198 SHRL $0x10, R9 199 LEAQ (BX)(R14*1), CX 200 MOVQ DX, R15 201 MOVQ CX, BX 202 ROLQ CL, R15 203 MOVL $0x00000001, BP 204 MOVB R14, CL 205 SHLL CL, BP 206 DECL BP 207 ANDQ BP, R15 208 ADDQ R15, R9 209 210 // Load ctx.ofTable 211 MOVQ ctx+16(FP), CX 212 MOVQ 48(CX), CX 213 MOVQ (CX)(R9*8), R9 214 215 sequenceDecs_decode_amd64_skip_update: 216 // Adjust offset 217 MOVQ 16(R10), CX 218 CMPQ AX, $0x01 219 JBE sequenceDecs_decode_amd64_adjust_offsetB_1_or_0 220 MOVQ R12, R13 221 MOVQ R11, R12 222 MOVQ CX, R11 223 JMP sequenceDecs_decode_amd64_after_adjust 224 225 sequenceDecs_decode_amd64_adjust_offsetB_1_or_0: 226 CMPQ (R10), $0x00000000 227 JNE sequenceDecs_decode_amd64_adjust_offset_maybezero 228 INCQ CX 229 JMP sequenceDecs_decode_amd64_adjust_offset_nonzero 230 231 sequenceDecs_decode_amd64_adjust_offset_maybezero: 232 TESTQ CX, CX 233 JNZ sequenceDecs_decode_amd64_adjust_offset_nonzero 234 MOVQ R11, CX 235 JMP sequenceDecs_decode_amd64_after_adjust 236 237 sequenceDecs_decode_amd64_adjust_offset_nonzero: 238 CMPQ CX, $0x01 239 JB sequenceDecs_decode_amd64_adjust_zero 240 JEQ sequenceDecs_decode_amd64_adjust_one 241 CMPQ CX, $0x02 242 JA sequenceDecs_decode_amd64_adjust_three 243 JMP sequenceDecs_decode_amd64_adjust_two 244 245 sequenceDecs_decode_amd64_adjust_zero: 246 MOVQ R11, AX 247 JMP 
sequenceDecs_decode_amd64_adjust_test_temp_valid 248 249 sequenceDecs_decode_amd64_adjust_one: 250 MOVQ R12, AX 251 JMP sequenceDecs_decode_amd64_adjust_test_temp_valid 252 253 sequenceDecs_decode_amd64_adjust_two: 254 MOVQ R13, AX 255 JMP sequenceDecs_decode_amd64_adjust_test_temp_valid 256 257 sequenceDecs_decode_amd64_adjust_three: 258 LEAQ -1(R11), AX 259 260 sequenceDecs_decode_amd64_adjust_test_temp_valid: 261 TESTQ AX, AX 262 JNZ sequenceDecs_decode_amd64_adjust_temp_valid 263 MOVQ $0x00000001, AX 264 265 sequenceDecs_decode_amd64_adjust_temp_valid: 266 CMPQ CX, $0x01 267 CMOVQNE R12, R13 268 MOVQ R11, R12 269 MOVQ AX, R11 270 MOVQ AX, CX 271 272 sequenceDecs_decode_amd64_after_adjust: 273 MOVQ CX, 16(R10) 274 275 // Check values 276 MOVQ 8(R10), AX 277 MOVQ (R10), R14 278 LEAQ (AX)(R14*1), R15 279 MOVQ s+0(FP), BP 280 ADDQ R15, 256(BP) 281 MOVQ ctx+16(FP), R15 282 SUBQ R14, 128(R15) 283 JS error_not_enough_literals 284 CMPQ AX, $0x00020002 285 JA sequenceDecs_decode_amd64_error_match_len_too_big 286 TESTQ CX, CX 287 JNZ sequenceDecs_decode_amd64_match_len_ofs_ok 288 TESTQ AX, AX 289 JNZ sequenceDecs_decode_amd64_error_match_len_ofs_mismatch 290 291 sequenceDecs_decode_amd64_match_len_ofs_ok: 292 ADDQ $0x18, R10 293 MOVQ ctx+16(FP), AX 294 DECQ 96(AX) 295 JNS sequenceDecs_decode_amd64_main_loop 296 MOVQ s+0(FP), AX 297 MOVQ R11, 144(AX) 298 MOVQ R12, 152(AX) 299 MOVQ R13, 160(AX) 300 MOVQ br+8(FP), AX 301 MOVQ DX, 24(AX) 302 MOVB BL, 32(AX) 303 MOVQ SI, 8(AX) 304 305 // Return success 306 MOVQ $0x00000000, ret+24(FP) 307 RET 308 309 // Return with match length error 310 sequenceDecs_decode_amd64_error_match_len_ofs_mismatch: 311 MOVQ $0x00000001, ret+24(FP) 312 RET 313 314 // Return with match too long error 315 sequenceDecs_decode_amd64_error_match_len_too_big: 316 MOVQ $0x00000002, ret+24(FP) 317 RET 318 319 // Return with match offset too long error 320 MOVQ $0x00000003, ret+24(FP) 321 RET 322 323 // Return with not enough literals error 324 
error_not_enough_literals: 325 MOVQ $0x00000004, ret+24(FP) 326 RET 327 328 // Return with overread error 329 error_overread: 330 MOVQ $0x00000006, ret+24(FP) 331 RET 332 333 // func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int 334 // Requires: CMOV 335 TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32 336 MOVQ br+8(FP), CX 337 MOVQ 24(CX), DX 338 MOVBQZX 32(CX), BX 339 MOVQ (CX), AX 340 MOVQ 8(CX), SI 341 ADDQ SI, AX 342 MOVQ AX, (SP) 343 MOVQ ctx+16(FP), AX 344 MOVQ 72(AX), DI 345 MOVQ 80(AX), R8 346 MOVQ 88(AX), R9 347 MOVQ 104(AX), R10 348 MOVQ s+0(FP), AX 349 MOVQ 144(AX), R11 350 MOVQ 152(AX), R12 351 MOVQ 160(AX), R13 352 353 sequenceDecs_decode_56_amd64_main_loop: 354 MOVQ (SP), R14 355 356 // Fill bitreader to have enough for the offset and match length. 357 CMPQ SI, $0x08 358 JL sequenceDecs_decode_56_amd64_fill_byte_by_byte 359 MOVQ BX, AX 360 SHRQ $0x03, AX 361 SUBQ AX, R14 362 MOVQ (R14), DX 363 SUBQ AX, SI 364 ANDQ $0x07, BX 365 JMP sequenceDecs_decode_56_amd64_fill_end 366 367 sequenceDecs_decode_56_amd64_fill_byte_by_byte: 368 CMPQ SI, $0x00 369 JLE sequenceDecs_decode_56_amd64_fill_check_overread 370 CMPQ BX, $0x07 371 JLE sequenceDecs_decode_56_amd64_fill_end 372 SHLQ $0x08, DX 373 SUBQ $0x01, R14 374 SUBQ $0x01, SI 375 SUBQ $0x08, BX 376 MOVBQZX (R14), AX 377 ORQ AX, DX 378 JMP sequenceDecs_decode_56_amd64_fill_byte_by_byte 379 380 sequenceDecs_decode_56_amd64_fill_check_overread: 381 CMPQ BX, $0x40 382 JA error_overread 383 384 sequenceDecs_decode_56_amd64_fill_end: 385 // Update offset 386 MOVQ R9, AX 387 MOVQ BX, CX 388 MOVQ DX, R15 389 SHLQ CL, R15 390 MOVB AH, CL 391 SHRQ $0x20, AX 392 TESTQ CX, CX 393 JZ sequenceDecs_decode_56_amd64_of_update_zero 394 ADDQ CX, BX 395 CMPQ BX, $0x40 396 JA sequenceDecs_decode_56_amd64_of_update_zero 397 CMPQ CX, $0x40 398 JAE sequenceDecs_decode_56_amd64_of_update_zero 399 NEGQ CX 400 SHRQ CL, R15 401 ADDQ R15, AX 402 403 sequenceDecs_decode_56_amd64_of_update_zero: 404 
MOVQ AX, 16(R10) 405 406 // Update match length 407 MOVQ R8, AX 408 MOVQ BX, CX 409 MOVQ DX, R15 410 SHLQ CL, R15 411 MOVB AH, CL 412 SHRQ $0x20, AX 413 TESTQ CX, CX 414 JZ sequenceDecs_decode_56_amd64_ml_update_zero 415 ADDQ CX, BX 416 CMPQ BX, $0x40 417 JA sequenceDecs_decode_56_amd64_ml_update_zero 418 CMPQ CX, $0x40 419 JAE sequenceDecs_decode_56_amd64_ml_update_zero 420 NEGQ CX 421 SHRQ CL, R15 422 ADDQ R15, AX 423 424 sequenceDecs_decode_56_amd64_ml_update_zero: 425 MOVQ AX, 8(R10) 426 427 // Update literal length 428 MOVQ DI, AX 429 MOVQ BX, CX 430 MOVQ DX, R15 431 SHLQ CL, R15 432 MOVB AH, CL 433 SHRQ $0x20, AX 434 TESTQ CX, CX 435 JZ sequenceDecs_decode_56_amd64_ll_update_zero 436 ADDQ CX, BX 437 CMPQ BX, $0x40 438 JA sequenceDecs_decode_56_amd64_ll_update_zero 439 CMPQ CX, $0x40 440 JAE sequenceDecs_decode_56_amd64_ll_update_zero 441 NEGQ CX 442 SHRQ CL, R15 443 ADDQ R15, AX 444 445 sequenceDecs_decode_56_amd64_ll_update_zero: 446 MOVQ AX, (R10) 447 448 // Fill bitreader for state updates 449 MOVQ R14, (SP) 450 MOVQ R9, AX 451 SHRQ $0x08, AX 452 MOVBQZX AL, AX 453 MOVQ ctx+16(FP), CX 454 CMPQ 96(CX), $0x00 455 JZ sequenceDecs_decode_56_amd64_skip_update 456 457 // Update Literal Length State 458 MOVBQZX DI, R14 459 SHRL $0x10, DI 460 LEAQ (BX)(R14*1), CX 461 MOVQ DX, R15 462 MOVQ CX, BX 463 ROLQ CL, R15 464 MOVL $0x00000001, BP 465 MOVB R14, CL 466 SHLL CL, BP 467 DECL BP 468 ANDQ BP, R15 469 ADDQ R15, DI 470 471 // Load ctx.llTable 472 MOVQ ctx+16(FP), CX 473 MOVQ (CX), CX 474 MOVQ (CX)(DI*8), DI 475 476 // Update Match Length State 477 MOVBQZX R8, R14 478 SHRL $0x10, R8 479 LEAQ (BX)(R14*1), CX 480 MOVQ DX, R15 481 MOVQ CX, BX 482 ROLQ CL, R15 483 MOVL $0x00000001, BP 484 MOVB R14, CL 485 SHLL CL, BP 486 DECL BP 487 ANDQ BP, R15 488 ADDQ R15, R8 489 490 // Load ctx.mlTable 491 MOVQ ctx+16(FP), CX 492 MOVQ 24(CX), CX 493 MOVQ (CX)(R8*8), R8 494 495 // Update Offset State 496 MOVBQZX R9, R14 497 SHRL $0x10, R9 498 LEAQ (BX)(R14*1), CX 499 MOVQ DX, R15 500 
MOVQ CX, BX 501 ROLQ CL, R15 502 MOVL $0x00000001, BP 503 MOVB R14, CL 504 SHLL CL, BP 505 DECL BP 506 ANDQ BP, R15 507 ADDQ R15, R9 508 509 // Load ctx.ofTable 510 MOVQ ctx+16(FP), CX 511 MOVQ 48(CX), CX 512 MOVQ (CX)(R9*8), R9 513 514 sequenceDecs_decode_56_amd64_skip_update: 515 // Adjust offset 516 MOVQ 16(R10), CX 517 CMPQ AX, $0x01 518 JBE sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0 519 MOVQ R12, R13 520 MOVQ R11, R12 521 MOVQ CX, R11 522 JMP sequenceDecs_decode_56_amd64_after_adjust 523 524 sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0: 525 CMPQ (R10), $0x00000000 526 JNE sequenceDecs_decode_56_amd64_adjust_offset_maybezero 527 INCQ CX 528 JMP sequenceDecs_decode_56_amd64_adjust_offset_nonzero 529 530 sequenceDecs_decode_56_amd64_adjust_offset_maybezero: 531 TESTQ CX, CX 532 JNZ sequenceDecs_decode_56_amd64_adjust_offset_nonzero 533 MOVQ R11, CX 534 JMP sequenceDecs_decode_56_amd64_after_adjust 535 536 sequenceDecs_decode_56_amd64_adjust_offset_nonzero: 537 CMPQ CX, $0x01 538 JB sequenceDecs_decode_56_amd64_adjust_zero 539 JEQ sequenceDecs_decode_56_amd64_adjust_one 540 CMPQ CX, $0x02 541 JA sequenceDecs_decode_56_amd64_adjust_three 542 JMP sequenceDecs_decode_56_amd64_adjust_two 543 544 sequenceDecs_decode_56_amd64_adjust_zero: 545 MOVQ R11, AX 546 JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid 547 548 sequenceDecs_decode_56_amd64_adjust_one: 549 MOVQ R12, AX 550 JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid 551 552 sequenceDecs_decode_56_amd64_adjust_two: 553 MOVQ R13, AX 554 JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid 555 556 sequenceDecs_decode_56_amd64_adjust_three: 557 LEAQ -1(R11), AX 558 559 sequenceDecs_decode_56_amd64_adjust_test_temp_valid: 560 TESTQ AX, AX 561 JNZ sequenceDecs_decode_56_amd64_adjust_temp_valid 562 MOVQ $0x00000001, AX 563 564 sequenceDecs_decode_56_amd64_adjust_temp_valid: 565 CMPQ CX, $0x01 566 CMOVQNE R12, R13 567 MOVQ R11, R12 568 MOVQ AX, R11 569 MOVQ AX, CX 570 571 
sequenceDecs_decode_56_amd64_after_adjust: 572 MOVQ CX, 16(R10) 573 574 // Check values 575 MOVQ 8(R10), AX 576 MOVQ (R10), R14 577 LEAQ (AX)(R14*1), R15 578 MOVQ s+0(FP), BP 579 ADDQ R15, 256(BP) 580 MOVQ ctx+16(FP), R15 581 SUBQ R14, 128(R15) 582 JS error_not_enough_literals 583 CMPQ AX, $0x00020002 584 JA sequenceDecs_decode_56_amd64_error_match_len_too_big 585 TESTQ CX, CX 586 JNZ sequenceDecs_decode_56_amd64_match_len_ofs_ok 587 TESTQ AX, AX 588 JNZ sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch 589 590 sequenceDecs_decode_56_amd64_match_len_ofs_ok: 591 ADDQ $0x18, R10 592 MOVQ ctx+16(FP), AX 593 DECQ 96(AX) 594 JNS sequenceDecs_decode_56_amd64_main_loop 595 MOVQ s+0(FP), AX 596 MOVQ R11, 144(AX) 597 MOVQ R12, 152(AX) 598 MOVQ R13, 160(AX) 599 MOVQ br+8(FP), AX 600 MOVQ DX, 24(AX) 601 MOVB BL, 32(AX) 602 MOVQ SI, 8(AX) 603 604 // Return success 605 MOVQ $0x00000000, ret+24(FP) 606 RET 607 608 // Return with match length error 609 sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch: 610 MOVQ $0x00000001, ret+24(FP) 611 RET 612 613 // Return with match too long error 614 sequenceDecs_decode_56_amd64_error_match_len_too_big: 615 MOVQ $0x00000002, ret+24(FP) 616 RET 617 618 // Return with match offset too long error 619 MOVQ $0x00000003, ret+24(FP) 620 RET 621 622 // Return with not enough literals error 623 error_not_enough_literals: 624 MOVQ $0x00000004, ret+24(FP) 625 RET 626 627 // Return with overread error 628 error_overread: 629 MOVQ $0x00000006, ret+24(FP) 630 RET 631 632 // func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int 633 // Requires: BMI, BMI2, CMOV 634 TEXT ·sequenceDecs_decode_bmi2(SB), $8-32 635 MOVQ br+8(FP), BX 636 MOVQ 24(BX), AX 637 MOVBQZX 32(BX), DX 638 MOVQ (BX), CX 639 MOVQ 8(BX), BX 640 ADDQ BX, CX 641 MOVQ CX, (SP) 642 MOVQ ctx+16(FP), CX 643 MOVQ 72(CX), SI 644 MOVQ 80(CX), DI 645 MOVQ 88(CX), R8 646 MOVQ 104(CX), R9 647 MOVQ s+0(FP), CX 648 MOVQ 144(CX), R10 649 MOVQ 152(CX), R11 
650 MOVQ 160(CX), R12 651 652 sequenceDecs_decode_bmi2_main_loop: 653 MOVQ (SP), R13 654 655 // Fill bitreader to have enough for the offset and match length. 656 CMPQ BX, $0x08 657 JL sequenceDecs_decode_bmi2_fill_byte_by_byte 658 MOVQ DX, CX 659 SHRQ $0x03, CX 660 SUBQ CX, R13 661 MOVQ (R13), AX 662 SUBQ CX, BX 663 ANDQ $0x07, DX 664 JMP sequenceDecs_decode_bmi2_fill_end 665 666 sequenceDecs_decode_bmi2_fill_byte_by_byte: 667 CMPQ BX, $0x00 668 JLE sequenceDecs_decode_bmi2_fill_check_overread 669 CMPQ DX, $0x07 670 JLE sequenceDecs_decode_bmi2_fill_end 671 SHLQ $0x08, AX 672 SUBQ $0x01, R13 673 SUBQ $0x01, BX 674 SUBQ $0x08, DX 675 MOVBQZX (R13), CX 676 ORQ CX, AX 677 JMP sequenceDecs_decode_bmi2_fill_byte_by_byte 678 679 sequenceDecs_decode_bmi2_fill_check_overread: 680 CMPQ DX, $0x40 681 JA error_overread 682 683 sequenceDecs_decode_bmi2_fill_end: 684 // Update offset 685 MOVQ $0x00000808, CX 686 BEXTRQ CX, R8, R14 687 MOVQ AX, R15 688 LEAQ (DX)(R14*1), CX 689 ROLQ CL, R15 690 BZHIQ R14, R15, R15 691 MOVQ CX, DX 692 MOVQ R8, CX 693 SHRQ $0x20, CX 694 ADDQ R15, CX 695 MOVQ CX, 16(R9) 696 697 // Update match length 698 MOVQ $0x00000808, CX 699 BEXTRQ CX, DI, R14 700 MOVQ AX, R15 701 LEAQ (DX)(R14*1), CX 702 ROLQ CL, R15 703 BZHIQ R14, R15, R15 704 MOVQ CX, DX 705 MOVQ DI, CX 706 SHRQ $0x20, CX 707 ADDQ R15, CX 708 MOVQ CX, 8(R9) 709 710 // Fill bitreader to have enough for the remaining 711 CMPQ BX, $0x08 712 JL sequenceDecs_decode_bmi2_fill_2_byte_by_byte 713 MOVQ DX, CX 714 SHRQ $0x03, CX 715 SUBQ CX, R13 716 MOVQ (R13), AX 717 SUBQ CX, BX 718 ANDQ $0x07, DX 719 JMP sequenceDecs_decode_bmi2_fill_2_end 720 721 sequenceDecs_decode_bmi2_fill_2_byte_by_byte: 722 CMPQ BX, $0x00 723 JLE sequenceDecs_decode_bmi2_fill_2_check_overread 724 CMPQ DX, $0x07 725 JLE sequenceDecs_decode_bmi2_fill_2_end 726 SHLQ $0x08, AX 727 SUBQ $0x01, R13 728 SUBQ $0x01, BX 729 SUBQ $0x08, DX 730 MOVBQZX (R13), CX 731 ORQ CX, AX 732 JMP sequenceDecs_decode_bmi2_fill_2_byte_by_byte 733 734 
sequenceDecs_decode_bmi2_fill_2_check_overread: 735 CMPQ DX, $0x40 736 JA error_overread 737 738 sequenceDecs_decode_bmi2_fill_2_end: 739 // Update literal length 740 MOVQ $0x00000808, CX 741 BEXTRQ CX, SI, R14 742 MOVQ AX, R15 743 LEAQ (DX)(R14*1), CX 744 ROLQ CL, R15 745 BZHIQ R14, R15, R15 746 MOVQ CX, DX 747 MOVQ SI, CX 748 SHRQ $0x20, CX 749 ADDQ R15, CX 750 MOVQ CX, (R9) 751 752 // Fill bitreader for state updates 753 MOVQ R13, (SP) 754 MOVQ $0x00000808, CX 755 BEXTRQ CX, R8, R13 756 MOVQ ctx+16(FP), CX 757 CMPQ 96(CX), $0x00 758 JZ sequenceDecs_decode_bmi2_skip_update 759 LEAQ (SI)(DI*1), R14 760 ADDQ R8, R14 761 MOVBQZX R14, R14 762 LEAQ (DX)(R14*1), CX 763 MOVQ AX, R15 764 MOVQ CX, DX 765 ROLQ CL, R15 766 BZHIQ R14, R15, R15 767 768 // Update Offset State 769 BZHIQ R8, R15, CX 770 SHRXQ R8, R15, R15 771 SHRL $0x10, R8 772 ADDQ CX, R8 773 774 // Load ctx.ofTable 775 MOVQ ctx+16(FP), CX 776 MOVQ 48(CX), CX 777 MOVQ (CX)(R8*8), R8 778 779 // Update Match Length State 780 BZHIQ DI, R15, CX 781 SHRXQ DI, R15, R15 782 SHRL $0x10, DI 783 ADDQ CX, DI 784 785 // Load ctx.mlTable 786 MOVQ ctx+16(FP), CX 787 MOVQ 24(CX), CX 788 MOVQ (CX)(DI*8), DI 789 790 // Update Literal Length State 791 BZHIQ SI, R15, CX 792 SHRL $0x10, SI 793 ADDQ CX, SI 794 795 // Load ctx.llTable 796 MOVQ ctx+16(FP), CX 797 MOVQ (CX), CX 798 MOVQ (CX)(SI*8), SI 799 800 sequenceDecs_decode_bmi2_skip_update: 801 // Adjust offset 802 MOVQ 16(R9), CX 803 CMPQ R13, $0x01 804 JBE sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0 805 MOVQ R11, R12 806 MOVQ R10, R11 807 MOVQ CX, R10 808 JMP sequenceDecs_decode_bmi2_after_adjust 809 810 sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0: 811 CMPQ (R9), $0x00000000 812 JNE sequenceDecs_decode_bmi2_adjust_offset_maybezero 813 INCQ CX 814 JMP sequenceDecs_decode_bmi2_adjust_offset_nonzero 815 816 sequenceDecs_decode_bmi2_adjust_offset_maybezero: 817 TESTQ CX, CX 818 JNZ sequenceDecs_decode_bmi2_adjust_offset_nonzero 819 MOVQ R10, CX 820 JMP 
sequenceDecs_decode_bmi2_after_adjust 821 822 sequenceDecs_decode_bmi2_adjust_offset_nonzero: 823 CMPQ CX, $0x01 824 JB sequenceDecs_decode_bmi2_adjust_zero 825 JEQ sequenceDecs_decode_bmi2_adjust_one 826 CMPQ CX, $0x02 827 JA sequenceDecs_decode_bmi2_adjust_three 828 JMP sequenceDecs_decode_bmi2_adjust_two 829 830 sequenceDecs_decode_bmi2_adjust_zero: 831 MOVQ R10, R13 832 JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid 833 834 sequenceDecs_decode_bmi2_adjust_one: 835 MOVQ R11, R13 836 JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid 837 838 sequenceDecs_decode_bmi2_adjust_two: 839 MOVQ R12, R13 840 JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid 841 842 sequenceDecs_decode_bmi2_adjust_three: 843 LEAQ -1(R10), R13 844 845 sequenceDecs_decode_bmi2_adjust_test_temp_valid: 846 TESTQ R13, R13 847 JNZ sequenceDecs_decode_bmi2_adjust_temp_valid 848 MOVQ $0x00000001, R13 849 850 sequenceDecs_decode_bmi2_adjust_temp_valid: 851 CMPQ CX, $0x01 852 CMOVQNE R11, R12 853 MOVQ R10, R11 854 MOVQ R13, R10 855 MOVQ R13, CX 856 857 sequenceDecs_decode_bmi2_after_adjust: 858 MOVQ CX, 16(R9) 859 860 // Check values 861 MOVQ 8(R9), R13 862 MOVQ (R9), R14 863 LEAQ (R13)(R14*1), R15 864 MOVQ s+0(FP), BP 865 ADDQ R15, 256(BP) 866 MOVQ ctx+16(FP), R15 867 SUBQ R14, 128(R15) 868 JS error_not_enough_literals 869 CMPQ R13, $0x00020002 870 JA sequenceDecs_decode_bmi2_error_match_len_too_big 871 TESTQ CX, CX 872 JNZ sequenceDecs_decode_bmi2_match_len_ofs_ok 873 TESTQ R13, R13 874 JNZ sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch 875 876 sequenceDecs_decode_bmi2_match_len_ofs_ok: 877 ADDQ $0x18, R9 878 MOVQ ctx+16(FP), CX 879 DECQ 96(CX) 880 JNS sequenceDecs_decode_bmi2_main_loop 881 MOVQ s+0(FP), CX 882 MOVQ R10, 144(CX) 883 MOVQ R11, 152(CX) 884 MOVQ R12, 160(CX) 885 MOVQ br+8(FP), CX 886 MOVQ AX, 24(CX) 887 MOVB DL, 32(CX) 888 MOVQ BX, 8(CX) 889 890 // Return success 891 MOVQ $0x00000000, ret+24(FP) 892 RET 893 894 // Return with match length error 895 
sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch: 896 MOVQ $0x00000001, ret+24(FP) 897 RET 898 899 // Return with match too long error 900 sequenceDecs_decode_bmi2_error_match_len_too_big: 901 MOVQ $0x00000002, ret+24(FP) 902 RET 903 904 // Return with match offset too long error 905 MOVQ $0x00000003, ret+24(FP) 906 RET 907 908 // Return with not enough literals error 909 error_not_enough_literals: 910 MOVQ $0x00000004, ret+24(FP) 911 RET 912 913 // Return with overread error 914 error_overread: 915 MOVQ $0x00000006, ret+24(FP) 916 RET 917 918 // func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int 919 // Requires: BMI, BMI2, CMOV 920 TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32 921 MOVQ br+8(FP), BX 922 MOVQ 24(BX), AX 923 MOVBQZX 32(BX), DX 924 MOVQ (BX), CX 925 MOVQ 8(BX), BX 926 ADDQ BX, CX 927 MOVQ CX, (SP) 928 MOVQ ctx+16(FP), CX 929 MOVQ 72(CX), SI 930 MOVQ 80(CX), DI 931 MOVQ 88(CX), R8 932 MOVQ 104(CX), R9 933 MOVQ s+0(FP), CX 934 MOVQ 144(CX), R10 935 MOVQ 152(CX), R11 936 MOVQ 160(CX), R12 937 938 sequenceDecs_decode_56_bmi2_main_loop: 939 MOVQ (SP), R13 940 941 // Fill bitreader to have enough for the offset and match length. 
942 CMPQ BX, $0x08 943 JL sequenceDecs_decode_56_bmi2_fill_byte_by_byte 944 MOVQ DX, CX 945 SHRQ $0x03, CX 946 SUBQ CX, R13 947 MOVQ (R13), AX 948 SUBQ CX, BX 949 ANDQ $0x07, DX 950 JMP sequenceDecs_decode_56_bmi2_fill_end 951 952 sequenceDecs_decode_56_bmi2_fill_byte_by_byte: 953 CMPQ BX, $0x00 954 JLE sequenceDecs_decode_56_bmi2_fill_check_overread 955 CMPQ DX, $0x07 956 JLE sequenceDecs_decode_56_bmi2_fill_end 957 SHLQ $0x08, AX 958 SUBQ $0x01, R13 959 SUBQ $0x01, BX 960 SUBQ $0x08, DX 961 MOVBQZX (R13), CX 962 ORQ CX, AX 963 JMP sequenceDecs_decode_56_bmi2_fill_byte_by_byte 964 965 sequenceDecs_decode_56_bmi2_fill_check_overread: 966 CMPQ DX, $0x40 967 JA error_overread 968 969 sequenceDecs_decode_56_bmi2_fill_end: 970 // Update offset 971 MOVQ $0x00000808, CX 972 BEXTRQ CX, R8, R14 973 MOVQ AX, R15 974 LEAQ (DX)(R14*1), CX 975 ROLQ CL, R15 976 BZHIQ R14, R15, R15 977 MOVQ CX, DX 978 MOVQ R8, CX 979 SHRQ $0x20, CX 980 ADDQ R15, CX 981 MOVQ CX, 16(R9) 982 983 // Update match length 984 MOVQ $0x00000808, CX 985 BEXTRQ CX, DI, R14 986 MOVQ AX, R15 987 LEAQ (DX)(R14*1), CX 988 ROLQ CL, R15 989 BZHIQ R14, R15, R15 990 MOVQ CX, DX 991 MOVQ DI, CX 992 SHRQ $0x20, CX 993 ADDQ R15, CX 994 MOVQ CX, 8(R9) 995 996 // Update literal length 997 MOVQ $0x00000808, CX 998 BEXTRQ CX, SI, R14 999 MOVQ AX, R15 1000 LEAQ (DX)(R14*1), CX 1001 ROLQ CL, R15 1002 BZHIQ R14, R15, R15 1003 MOVQ CX, DX 1004 MOVQ SI, CX 1005 SHRQ $0x20, CX 1006 ADDQ R15, CX 1007 MOVQ CX, (R9) 1008 1009 // Fill bitreader for state updates 1010 MOVQ R13, (SP) 1011 MOVQ $0x00000808, CX 1012 BEXTRQ CX, R8, R13 1013 MOVQ ctx+16(FP), CX 1014 CMPQ 96(CX), $0x00 1015 JZ sequenceDecs_decode_56_bmi2_skip_update 1016 LEAQ (SI)(DI*1), R14 1017 ADDQ R8, R14 1018 MOVBQZX R14, R14 1019 LEAQ (DX)(R14*1), CX 1020 MOVQ AX, R15 1021 MOVQ CX, DX 1022 ROLQ CL, R15 1023 BZHIQ R14, R15, R15 1024 1025 // Update Offset State 1026 BZHIQ R8, R15, CX 1027 SHRXQ R8, R15, R15 1028 SHRL $0x10, R8 1029 ADDQ CX, R8 1030 1031 // Load 
ctx.ofTable 1032 MOVQ ctx+16(FP), CX 1033 MOVQ 48(CX), CX 1034 MOVQ (CX)(R8*8), R8 1035 1036 // Update Match Length State 1037 BZHIQ DI, R15, CX 1038 SHRXQ DI, R15, R15 1039 SHRL $0x10, DI 1040 ADDQ CX, DI 1041 1042 // Load ctx.mlTable 1043 MOVQ ctx+16(FP), CX 1044 MOVQ 24(CX), CX 1045 MOVQ (CX)(DI*8), DI 1046 1047 // Update Literal Length State 1048 BZHIQ SI, R15, CX 1049 SHRL $0x10, SI 1050 ADDQ CX, SI 1051 1052 // Load ctx.llTable 1053 MOVQ ctx+16(FP), CX 1054 MOVQ (CX), CX 1055 MOVQ (CX)(SI*8), SI 1056 1057 sequenceDecs_decode_56_bmi2_skip_update: 1058 // Adjust offset 1059 MOVQ 16(R9), CX 1060 CMPQ R13, $0x01 1061 JBE sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0 1062 MOVQ R11, R12 1063 MOVQ R10, R11 1064 MOVQ CX, R10 1065 JMP sequenceDecs_decode_56_bmi2_after_adjust 1066 1067 sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0: 1068 CMPQ (R9), $0x00000000 1069 JNE sequenceDecs_decode_56_bmi2_adjust_offset_maybezero 1070 INCQ CX 1071 JMP sequenceDecs_decode_56_bmi2_adjust_offset_nonzero 1072 1073 sequenceDecs_decode_56_bmi2_adjust_offset_maybezero: 1074 TESTQ CX, CX 1075 JNZ sequenceDecs_decode_56_bmi2_adjust_offset_nonzero 1076 MOVQ R10, CX 1077 JMP sequenceDecs_decode_56_bmi2_after_adjust 1078 1079 sequenceDecs_decode_56_bmi2_adjust_offset_nonzero: 1080 CMPQ CX, $0x01 1081 JB sequenceDecs_decode_56_bmi2_adjust_zero 1082 JEQ sequenceDecs_decode_56_bmi2_adjust_one 1083 CMPQ CX, $0x02 1084 JA sequenceDecs_decode_56_bmi2_adjust_three 1085 JMP sequenceDecs_decode_56_bmi2_adjust_two 1086 1087 sequenceDecs_decode_56_bmi2_adjust_zero: 1088 MOVQ R10, R13 1089 JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid 1090 1091 sequenceDecs_decode_56_bmi2_adjust_one: 1092 MOVQ R11, R13 1093 JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid 1094 1095 sequenceDecs_decode_56_bmi2_adjust_two: 1096 MOVQ R12, R13 1097 JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid 1098 1099 sequenceDecs_decode_56_bmi2_adjust_three: 1100 LEAQ -1(R10), R13 1101 1102 
sequenceDecs_decode_56_bmi2_adjust_test_temp_valid: 1103 TESTQ R13, R13 1104 JNZ sequenceDecs_decode_56_bmi2_adjust_temp_valid 1105 MOVQ $0x00000001, R13 1106 1107 sequenceDecs_decode_56_bmi2_adjust_temp_valid: 1108 CMPQ CX, $0x01 1109 CMOVQNE R11, R12 1110 MOVQ R10, R11 1111 MOVQ R13, R10 1112 MOVQ R13, CX 1113 1114 sequenceDecs_decode_56_bmi2_after_adjust: 1115 MOVQ CX, 16(R9) 1116 1117 // Check values 1118 MOVQ 8(R9), R13 1119 MOVQ (R9), R14 1120 LEAQ (R13)(R14*1), R15 1121 MOVQ s+0(FP), BP 1122 ADDQ R15, 256(BP) 1123 MOVQ ctx+16(FP), R15 1124 SUBQ R14, 128(R15) 1125 JS error_not_enough_literals 1126 CMPQ R13, $0x00020002 1127 JA sequenceDecs_decode_56_bmi2_error_match_len_too_big 1128 TESTQ CX, CX 1129 JNZ sequenceDecs_decode_56_bmi2_match_len_ofs_ok 1130 TESTQ R13, R13 1131 JNZ sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch 1132 1133 sequenceDecs_decode_56_bmi2_match_len_ofs_ok: 1134 ADDQ $0x18, R9 1135 MOVQ ctx+16(FP), CX 1136 DECQ 96(CX) 1137 JNS sequenceDecs_decode_56_bmi2_main_loop 1138 MOVQ s+0(FP), CX 1139 MOVQ R10, 144(CX) 1140 MOVQ R11, 152(CX) 1141 MOVQ R12, 160(CX) 1142 MOVQ br+8(FP), CX 1143 MOVQ AX, 24(CX) 1144 MOVB DL, 32(CX) 1145 MOVQ BX, 8(CX) 1146 1147 // Return success 1148 MOVQ $0x00000000, ret+24(FP) 1149 RET 1150 1151 // Return with match length error 1152 sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch: 1153 MOVQ $0x00000001, ret+24(FP) 1154 RET 1155 1156 // Return with match too long error 1157 sequenceDecs_decode_56_bmi2_error_match_len_too_big: 1158 MOVQ $0x00000002, ret+24(FP) 1159 RET 1160 1161 // Return with match offset too long error 1162 MOVQ $0x00000003, ret+24(FP) 1163 RET 1164 1165 // Return with not enough literals error 1166 error_not_enough_literals: 1167 MOVQ $0x00000004, ret+24(FP) 1168 RET 1169 1170 // Return with overread error 1171 error_overread: 1172 MOVQ $0x00000006, ret+24(FP) 1173 RET 1174 1175 // func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool 1176 // Requires: SSE 1177 TEXT 
·sequenceDecs_executeSimple_amd64(SB), $8-9 1178 MOVQ ctx+0(FP), R10 1179 MOVQ 8(R10), CX 1180 TESTQ CX, CX 1181 JZ empty_seqs 1182 MOVQ (R10), AX 1183 MOVQ 24(R10), DX 1184 MOVQ 32(R10), BX 1185 MOVQ 80(R10), SI 1186 MOVQ 104(R10), DI 1187 MOVQ 120(R10), R8 1188 MOVQ 56(R10), R9 1189 MOVQ 64(R10), R10 1190 ADDQ R10, R9 1191 1192 // seqsBase += 24 * seqIndex 1193 LEAQ (DX)(DX*2), R11 1194 SHLQ $0x03, R11 1195 ADDQ R11, AX 1196 1197 // outBase += outPosition 1198 ADDQ DI, BX 1199 1200 main_loop: 1201 MOVQ (AX), R11 1202 MOVQ 16(AX), R12 1203 MOVQ 8(AX), R13 1204 1205 // Copy literals 1206 TESTQ R11, R11 1207 JZ check_offset 1208 XORQ R14, R14 1209 1210 copy_1: 1211 MOVUPS (SI)(R14*1), X0 1212 MOVUPS X0, (BX)(R14*1) 1213 ADDQ $0x10, R14 1214 CMPQ R14, R11 1215 JB copy_1 1216 ADDQ R11, SI 1217 ADDQ R11, BX 1218 ADDQ R11, DI 1219 1220 // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) 1221 check_offset: 1222 LEAQ (DI)(R10*1), R11 1223 CMPQ R12, R11 1224 JG error_match_off_too_big 1225 CMPQ R12, R8 1226 JG error_match_off_too_big 1227 1228 // Copy match from history 1229 MOVQ R12, R11 1230 SUBQ DI, R11 1231 JLS copy_match 1232 MOVQ R9, R14 1233 SUBQ R11, R14 1234 CMPQ R13, R11 1235 JG copy_all_from_history 1236 MOVQ R13, R11 1237 SUBQ $0x10, R11 1238 JB copy_4_small 1239 1240 copy_4_loop: 1241 MOVUPS (R14), X0 1242 MOVUPS X0, (BX) 1243 ADDQ $0x10, R14 1244 ADDQ $0x10, BX 1245 SUBQ $0x10, R11 1246 JAE copy_4_loop 1247 LEAQ 16(R14)(R11*1), R14 1248 LEAQ 16(BX)(R11*1), BX 1249 MOVUPS -16(R14), X0 1250 MOVUPS X0, -16(BX) 1251 JMP copy_4_end 1252 1253 copy_4_small: 1254 CMPQ R13, $0x03 1255 JE copy_4_move_3 1256 CMPQ R13, $0x08 1257 JB copy_4_move_4through7 1258 JMP copy_4_move_8through16 1259 1260 copy_4_move_3: 1261 MOVW (R14), R11 1262 MOVB 2(R14), R12 1263 MOVW R11, (BX) 1264 MOVB R12, 2(BX) 1265 ADDQ R13, R14 1266 ADDQ R13, BX 1267 JMP copy_4_end 1268 1269 copy_4_move_4through7: 1270 MOVL (R14), R11 1271 MOVL -4(R14)(R13*1), R12 1272 MOVL R11, (BX) 
1273 MOVL R12, -4(BX)(R13*1) 1274 ADDQ R13, R14 1275 ADDQ R13, BX 1276 JMP copy_4_end 1277 1278 copy_4_move_8through16: 1279 MOVQ (R14), R11 1280 MOVQ -8(R14)(R13*1), R12 1281 MOVQ R11, (BX) 1282 MOVQ R12, -8(BX)(R13*1) 1283 ADDQ R13, R14 1284 ADDQ R13, BX 1285 1286 copy_4_end: 1287 ADDQ R13, DI 1288 ADDQ $0x18, AX 1289 INCQ DX 1290 CMPQ DX, CX 1291 JB main_loop 1292 JMP loop_finished 1293 1294 copy_all_from_history: 1295 MOVQ R11, R15 1296 SUBQ $0x10, R15 1297 JB copy_5_small 1298 1299 copy_5_loop: 1300 MOVUPS (R14), X0 1301 MOVUPS X0, (BX) 1302 ADDQ $0x10, R14 1303 ADDQ $0x10, BX 1304 SUBQ $0x10, R15 1305 JAE copy_5_loop 1306 LEAQ 16(R14)(R15*1), R14 1307 LEAQ 16(BX)(R15*1), BX 1308 MOVUPS -16(R14), X0 1309 MOVUPS X0, -16(BX) 1310 JMP copy_5_end 1311 1312 copy_5_small: 1313 CMPQ R11, $0x03 1314 JE copy_5_move_3 1315 JB copy_5_move_1or2 1316 CMPQ R11, $0x08 1317 JB copy_5_move_4through7 1318 JMP copy_5_move_8through16 1319 1320 copy_5_move_1or2: 1321 MOVB (R14), R15 1322 MOVB -1(R14)(R11*1), BP 1323 MOVB R15, (BX) 1324 MOVB BP, -1(BX)(R11*1) 1325 ADDQ R11, R14 1326 ADDQ R11, BX 1327 JMP copy_5_end 1328 1329 copy_5_move_3: 1330 MOVW (R14), R15 1331 MOVB 2(R14), BP 1332 MOVW R15, (BX) 1333 MOVB BP, 2(BX) 1334 ADDQ R11, R14 1335 ADDQ R11, BX 1336 JMP copy_5_end 1337 1338 copy_5_move_4through7: 1339 MOVL (R14), R15 1340 MOVL -4(R14)(R11*1), BP 1341 MOVL R15, (BX) 1342 MOVL BP, -4(BX)(R11*1) 1343 ADDQ R11, R14 1344 ADDQ R11, BX 1345 JMP copy_5_end 1346 1347 copy_5_move_8through16: 1348 MOVQ (R14), R15 1349 MOVQ -8(R14)(R11*1), BP 1350 MOVQ R15, (BX) 1351 MOVQ BP, -8(BX)(R11*1) 1352 ADDQ R11, R14 1353 ADDQ R11, BX 1354 1355 copy_5_end: 1356 ADDQ R11, DI 1357 SUBQ R11, R13 1358 1359 // Copy match from the current buffer 1360 copy_match: 1361 MOVQ BX, R11 1362 SUBQ R12, R11 1363 1364 // ml <= mo 1365 CMPQ R13, R12 1366 JA copy_overlapping_match 1367 1368 // Copy non-overlapping match 1369 ADDQ R13, DI 1370 MOVQ BX, R12 1371 ADDQ R13, BX 1372 1373 copy_2: 1374 MOVUPS 
(R11), X0 1375 MOVUPS X0, (R12) 1376 ADDQ $0x10, R11 1377 ADDQ $0x10, R12 1378 SUBQ $0x10, R13 1379 JHI copy_2 1380 JMP handle_loop 1381 1382 // Copy overlapping match 1383 copy_overlapping_match: 1384 ADDQ R13, DI 1385 1386 copy_slow_3: 1387 MOVB (R11), R12 1388 MOVB R12, (BX) 1389 INCQ R11 1390 INCQ BX 1391 DECQ R13 1392 JNZ copy_slow_3 1393 1394 handle_loop: 1395 ADDQ $0x18, AX 1396 INCQ DX 1397 CMPQ DX, CX 1398 JB main_loop 1399 1400 loop_finished: 1401 // Return value 1402 MOVB $0x01, ret+8(FP) 1403 1404 // Update the context 1405 MOVQ ctx+0(FP), AX 1406 MOVQ DX, 24(AX) 1407 MOVQ DI, 104(AX) 1408 SUBQ 80(AX), SI 1409 MOVQ SI, 112(AX) 1410 RET 1411 1412 error_match_off_too_big: 1413 // Return value 1414 MOVB $0x00, ret+8(FP) 1415 1416 // Update the context 1417 MOVQ ctx+0(FP), AX 1418 MOVQ DX, 24(AX) 1419 MOVQ DI, 104(AX) 1420 SUBQ 80(AX), SI 1421 MOVQ SI, 112(AX) 1422 RET 1423 1424 empty_seqs: 1425 // Return value 1426 MOVB $0x01, ret+8(FP) 1427 RET 1428 1429 // func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool 1430 // Requires: SSE 1431 TEXT ·sequenceDecs_executeSimple_safe_amd64(SB), $8-9 1432 MOVQ ctx+0(FP), R10 1433 MOVQ 8(R10), CX 1434 TESTQ CX, CX 1435 JZ empty_seqs 1436 MOVQ (R10), AX 1437 MOVQ 24(R10), DX 1438 MOVQ 32(R10), BX 1439 MOVQ 80(R10), SI 1440 MOVQ 104(R10), DI 1441 MOVQ 120(R10), R8 1442 MOVQ 56(R10), R9 1443 MOVQ 64(R10), R10 1444 ADDQ R10, R9 1445 1446 // seqsBase += 24 * seqIndex 1447 LEAQ (DX)(DX*2), R11 1448 SHLQ $0x03, R11 1449 ADDQ R11, AX 1450 1451 // outBase += outPosition 1452 ADDQ DI, BX 1453 1454 main_loop: 1455 MOVQ (AX), R11 1456 MOVQ 16(AX), R12 1457 MOVQ 8(AX), R13 1458 1459 // Copy literals 1460 TESTQ R11, R11 1461 JZ check_offset 1462 MOVQ R11, R14 1463 SUBQ $0x10, R14 1464 JB copy_1_small 1465 1466 copy_1_loop: 1467 MOVUPS (SI), X0 1468 MOVUPS X0, (BX) 1469 ADDQ $0x10, SI 1470 ADDQ $0x10, BX 1471 SUBQ $0x10, R14 1472 JAE copy_1_loop 1473 LEAQ 16(SI)(R14*1), SI 1474 LEAQ 16(BX)(R14*1), BX 1475 MOVUPS 
-16(SI), X0 1476 MOVUPS X0, -16(BX) 1477 JMP copy_1_end 1478 1479 copy_1_small: 1480 CMPQ R11, $0x03 1481 JE copy_1_move_3 1482 JB copy_1_move_1or2 1483 CMPQ R11, $0x08 1484 JB copy_1_move_4through7 1485 JMP copy_1_move_8through16 1486 1487 copy_1_move_1or2: 1488 MOVB (SI), R14 1489 MOVB -1(SI)(R11*1), R15 1490 MOVB R14, (BX) 1491 MOVB R15, -1(BX)(R11*1) 1492 ADDQ R11, SI 1493 ADDQ R11, BX 1494 JMP copy_1_end 1495 1496 copy_1_move_3: 1497 MOVW (SI), R14 1498 MOVB 2(SI), R15 1499 MOVW R14, (BX) 1500 MOVB R15, 2(BX) 1501 ADDQ R11, SI 1502 ADDQ R11, BX 1503 JMP copy_1_end 1504 1505 copy_1_move_4through7: 1506 MOVL (SI), R14 1507 MOVL -4(SI)(R11*1), R15 1508 MOVL R14, (BX) 1509 MOVL R15, -4(BX)(R11*1) 1510 ADDQ R11, SI 1511 ADDQ R11, BX 1512 JMP copy_1_end 1513 1514 copy_1_move_8through16: 1515 MOVQ (SI), R14 1516 MOVQ -8(SI)(R11*1), R15 1517 MOVQ R14, (BX) 1518 MOVQ R15, -8(BX)(R11*1) 1519 ADDQ R11, SI 1520 ADDQ R11, BX 1521 1522 copy_1_end: 1523 ADDQ R11, DI 1524 1525 // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize 1526 check_offset: 1527 LEAQ (DI)(R10*1), R11 1528 CMPQ R12, R11 1529 JG error_match_off_too_big 1530 CMPQ R12, R8 1531 JG error_match_off_too_big 1532 1533 // Copy match from history 1534 MOVQ R12, R11 1535 SUBQ DI, R11 1536 JLS copy_match 1537 MOVQ R9, R14 1538 SUBQ R11, R14 1539 CMPQ R13, R11 1540 JG copy_all_from_history 1541 MOVQ R13, R11 1542 SUBQ $0x10, R11 1543 JB copy_4_small 1544 1545 copy_4_loop: 1546 MOVUPS (R14), X0 1547 MOVUPS X0, (BX) 1548 ADDQ $0x10, R14 1549 ADDQ $0x10, BX 1550 SUBQ $0x10, R11 1551 JAE copy_4_loop 1552 LEAQ 16(R14)(R11*1), R14 1553 LEAQ 16(BX)(R11*1), BX 1554 MOVUPS -16(R14), X0 1555 MOVUPS X0, -16(BX) 1556 JMP copy_4_end 1557 1558 copy_4_small: 1559 CMPQ R13, $0x03 1560 JE copy_4_move_3 1561 CMPQ R13, $0x08 1562 JB copy_4_move_4through7 1563 JMP copy_4_move_8through16 1564 1565 copy_4_move_3: 1566 MOVW (R14), R11 1567 MOVB 2(R14), R12 1568 MOVW R11, (BX) 1569 MOVB R12, 2(BX) 1570 ADDQ R13, R14 1571 
ADDQ R13, BX 1572 JMP copy_4_end 1573 1574 copy_4_move_4through7: 1575 MOVL (R14), R11 1576 MOVL -4(R14)(R13*1), R12 1577 MOVL R11, (BX) 1578 MOVL R12, -4(BX)(R13*1) 1579 ADDQ R13, R14 1580 ADDQ R13, BX 1581 JMP copy_4_end 1582 1583 copy_4_move_8through16: 1584 MOVQ (R14), R11 1585 MOVQ -8(R14)(R13*1), R12 1586 MOVQ R11, (BX) 1587 MOVQ R12, -8(BX)(R13*1) 1588 ADDQ R13, R14 1589 ADDQ R13, BX 1590 1591 copy_4_end: 1592 ADDQ R13, DI 1593 ADDQ $0x18, AX 1594 INCQ DX 1595 CMPQ DX, CX 1596 JB main_loop 1597 JMP loop_finished 1598 1599 copy_all_from_history: 1600 MOVQ R11, R15 1601 SUBQ $0x10, R15 1602 JB copy_5_small 1603 1604 copy_5_loop: 1605 MOVUPS (R14), X0 1606 MOVUPS X0, (BX) 1607 ADDQ $0x10, R14 1608 ADDQ $0x10, BX 1609 SUBQ $0x10, R15 1610 JAE copy_5_loop 1611 LEAQ 16(R14)(R15*1), R14 1612 LEAQ 16(BX)(R15*1), BX 1613 MOVUPS -16(R14), X0 1614 MOVUPS X0, -16(BX) 1615 JMP copy_5_end 1616 1617 copy_5_small: 1618 CMPQ R11, $0x03 1619 JE copy_5_move_3 1620 JB copy_5_move_1or2 1621 CMPQ R11, $0x08 1622 JB copy_5_move_4through7 1623 JMP copy_5_move_8through16 1624 1625 copy_5_move_1or2: 1626 MOVB (R14), R15 1627 MOVB -1(R14)(R11*1), BP 1628 MOVB R15, (BX) 1629 MOVB BP, -1(BX)(R11*1) 1630 ADDQ R11, R14 1631 ADDQ R11, BX 1632 JMP copy_5_end 1633 1634 copy_5_move_3: 1635 MOVW (R14), R15 1636 MOVB 2(R14), BP 1637 MOVW R15, (BX) 1638 MOVB BP, 2(BX) 1639 ADDQ R11, R14 1640 ADDQ R11, BX 1641 JMP copy_5_end 1642 1643 copy_5_move_4through7: 1644 MOVL (R14), R15 1645 MOVL -4(R14)(R11*1), BP 1646 MOVL R15, (BX) 1647 MOVL BP, -4(BX)(R11*1) 1648 ADDQ R11, R14 1649 ADDQ R11, BX 1650 JMP copy_5_end 1651 1652 copy_5_move_8through16: 1653 MOVQ (R14), R15 1654 MOVQ -8(R14)(R11*1), BP 1655 MOVQ R15, (BX) 1656 MOVQ BP, -8(BX)(R11*1) 1657 ADDQ R11, R14 1658 ADDQ R11, BX 1659 1660 copy_5_end: 1661 ADDQ R11, DI 1662 SUBQ R11, R13 1663 1664 // Copy match from the current buffer 1665 copy_match: 1666 MOVQ BX, R11 1667 SUBQ R12, R11 1668 1669 // ml <= mo 1670 CMPQ R13, R12 1671 JA 
copy_overlapping_match 1672 1673 // Copy non-overlapping match 1674 ADDQ R13, DI 1675 MOVQ R13, R12 1676 SUBQ $0x10, R12 1677 JB copy_2_small 1678 1679 copy_2_loop: 1680 MOVUPS (R11), X0 1681 MOVUPS X0, (BX) 1682 ADDQ $0x10, R11 1683 ADDQ $0x10, BX 1684 SUBQ $0x10, R12 1685 JAE copy_2_loop 1686 LEAQ 16(R11)(R12*1), R11 1687 LEAQ 16(BX)(R12*1), BX 1688 MOVUPS -16(R11), X0 1689 MOVUPS X0, -16(BX) 1690 JMP copy_2_end 1691 1692 copy_2_small: 1693 CMPQ R13, $0x03 1694 JE copy_2_move_3 1695 JB copy_2_move_1or2 1696 CMPQ R13, $0x08 1697 JB copy_2_move_4through7 1698 JMP copy_2_move_8through16 1699 1700 copy_2_move_1or2: 1701 MOVB (R11), R12 1702 MOVB -1(R11)(R13*1), R14 1703 MOVB R12, (BX) 1704 MOVB R14, -1(BX)(R13*1) 1705 ADDQ R13, R11 1706 ADDQ R13, BX 1707 JMP copy_2_end 1708 1709 copy_2_move_3: 1710 MOVW (R11), R12 1711 MOVB 2(R11), R14 1712 MOVW R12, (BX) 1713 MOVB R14, 2(BX) 1714 ADDQ R13, R11 1715 ADDQ R13, BX 1716 JMP copy_2_end 1717 1718 copy_2_move_4through7: 1719 MOVL (R11), R12 1720 MOVL -4(R11)(R13*1), R14 1721 MOVL R12, (BX) 1722 MOVL R14, -4(BX)(R13*1) 1723 ADDQ R13, R11 1724 ADDQ R13, BX 1725 JMP copy_2_end 1726 1727 copy_2_move_8through16: 1728 MOVQ (R11), R12 1729 MOVQ -8(R11)(R13*1), R14 1730 MOVQ R12, (BX) 1731 MOVQ R14, -8(BX)(R13*1) 1732 ADDQ R13, R11 1733 ADDQ R13, BX 1734 1735 copy_2_end: 1736 JMP handle_loop 1737 1738 // Copy overlapping match 1739 copy_overlapping_match: 1740 ADDQ R13, DI 1741 1742 copy_slow_3: 1743 MOVB (R11), R12 1744 MOVB R12, (BX) 1745 INCQ R11 1746 INCQ BX 1747 DECQ R13 1748 JNZ copy_slow_3 1749 1750 handle_loop: 1751 ADDQ $0x18, AX 1752 INCQ DX 1753 CMPQ DX, CX 1754 JB main_loop 1755 1756 loop_finished: 1757 // Return value 1758 MOVB $0x01, ret+8(FP) 1759 1760 // Update the context 1761 MOVQ ctx+0(FP), AX 1762 MOVQ DX, 24(AX) 1763 MOVQ DI, 104(AX) 1764 SUBQ 80(AX), SI 1765 MOVQ SI, 112(AX) 1766 RET 1767 1768 error_match_off_too_big: 1769 // Return value 1770 MOVB $0x00, ret+8(FP) 1771 1772 // Update the context 1773 MOVQ 
ctx+0(FP), AX 1774 MOVQ DX, 24(AX) 1775 MOVQ DI, 104(AX) 1776 SUBQ 80(AX), SI 1777 MOVQ SI, 112(AX) 1778 RET 1779 1780 empty_seqs: 1781 // Return value 1782 MOVB $0x01, ret+8(FP) 1783 RET 1784 1785 // func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int 1786 // Requires: CMOV, SSE 1787 TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32 1788 MOVQ br+8(FP), CX 1789 MOVQ 24(CX), DX 1790 MOVBQZX 32(CX), BX 1791 MOVQ (CX), AX 1792 MOVQ 8(CX), SI 1793 ADDQ SI, AX 1794 MOVQ AX, (SP) 1795 MOVQ ctx+16(FP), AX 1796 MOVQ 72(AX), DI 1797 MOVQ 80(AX), R8 1798 MOVQ 88(AX), R9 1799 XORQ CX, CX 1800 MOVQ CX, 8(SP) 1801 MOVQ CX, 16(SP) 1802 MOVQ CX, 24(SP) 1803 MOVQ 112(AX), R10 1804 MOVQ 128(AX), CX 1805 MOVQ CX, 32(SP) 1806 MOVQ 144(AX), R11 1807 MOVQ 136(AX), R12 1808 MOVQ 200(AX), CX 1809 MOVQ CX, 56(SP) 1810 MOVQ 176(AX), CX 1811 MOVQ CX, 48(SP) 1812 MOVQ 184(AX), AX 1813 MOVQ AX, 40(SP) 1814 MOVQ 40(SP), AX 1815 ADDQ AX, 48(SP) 1816 1817 // Calculate pointer to s.out[cap(s.out)] (a past-end pointer) 1818 ADDQ R10, 32(SP) 1819 1820 // outBase += outPosition 1821 ADDQ R12, R10 1822 1823 sequenceDecs_decodeSync_amd64_main_loop: 1824 MOVQ (SP), R13 1825 1826 // Fill bitreader to have enough for the offset and match length. 
1827 CMPQ SI, $0x08 1828 JL sequenceDecs_decodeSync_amd64_fill_byte_by_byte 1829 MOVQ BX, AX 1830 SHRQ $0x03, AX 1831 SUBQ AX, R13 1832 MOVQ (R13), DX 1833 SUBQ AX, SI 1834 ANDQ $0x07, BX 1835 JMP sequenceDecs_decodeSync_amd64_fill_end 1836 1837 sequenceDecs_decodeSync_amd64_fill_byte_by_byte: 1838 CMPQ SI, $0x00 1839 JLE sequenceDecs_decodeSync_amd64_fill_check_overread 1840 CMPQ BX, $0x07 1841 JLE sequenceDecs_decodeSync_amd64_fill_end 1842 SHLQ $0x08, DX 1843 SUBQ $0x01, R13 1844 SUBQ $0x01, SI 1845 SUBQ $0x08, BX 1846 MOVBQZX (R13), AX 1847 ORQ AX, DX 1848 JMP sequenceDecs_decodeSync_amd64_fill_byte_by_byte 1849 1850 sequenceDecs_decodeSync_amd64_fill_check_overread: 1851 CMPQ BX, $0x40 1852 JA error_overread 1853 1854 sequenceDecs_decodeSync_amd64_fill_end: 1855 // Update offset 1856 MOVQ R9, AX 1857 MOVQ BX, CX 1858 MOVQ DX, R14 1859 SHLQ CL, R14 1860 MOVB AH, CL 1861 SHRQ $0x20, AX 1862 TESTQ CX, CX 1863 JZ sequenceDecs_decodeSync_amd64_of_update_zero 1864 ADDQ CX, BX 1865 CMPQ BX, $0x40 1866 JA sequenceDecs_decodeSync_amd64_of_update_zero 1867 CMPQ CX, $0x40 1868 JAE sequenceDecs_decodeSync_amd64_of_update_zero 1869 NEGQ CX 1870 SHRQ CL, R14 1871 ADDQ R14, AX 1872 1873 sequenceDecs_decodeSync_amd64_of_update_zero: 1874 MOVQ AX, 8(SP) 1875 1876 // Update match length 1877 MOVQ R8, AX 1878 MOVQ BX, CX 1879 MOVQ DX, R14 1880 SHLQ CL, R14 1881 MOVB AH, CL 1882 SHRQ $0x20, AX 1883 TESTQ CX, CX 1884 JZ sequenceDecs_decodeSync_amd64_ml_update_zero 1885 ADDQ CX, BX 1886 CMPQ BX, $0x40 1887 JA sequenceDecs_decodeSync_amd64_ml_update_zero 1888 CMPQ CX, $0x40 1889 JAE sequenceDecs_decodeSync_amd64_ml_update_zero 1890 NEGQ CX 1891 SHRQ CL, R14 1892 ADDQ R14, AX 1893 1894 sequenceDecs_decodeSync_amd64_ml_update_zero: 1895 MOVQ AX, 16(SP) 1896 1897 // Fill bitreader to have enough for the remaining 1898 CMPQ SI, $0x08 1899 JL sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte 1900 MOVQ BX, AX 1901 SHRQ $0x03, AX 1902 SUBQ AX, R13 1903 MOVQ (R13), DX 1904 SUBQ AX, SI 1905 
ANDQ $0x07, BX 1906 JMP sequenceDecs_decodeSync_amd64_fill_2_end 1907 1908 sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte: 1909 CMPQ SI, $0x00 1910 JLE sequenceDecs_decodeSync_amd64_fill_2_check_overread 1911 CMPQ BX, $0x07 1912 JLE sequenceDecs_decodeSync_amd64_fill_2_end 1913 SHLQ $0x08, DX 1914 SUBQ $0x01, R13 1915 SUBQ $0x01, SI 1916 SUBQ $0x08, BX 1917 MOVBQZX (R13), AX 1918 ORQ AX, DX 1919 JMP sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte 1920 1921 sequenceDecs_decodeSync_amd64_fill_2_check_overread: 1922 CMPQ BX, $0x40 1923 JA error_overread 1924 1925 sequenceDecs_decodeSync_amd64_fill_2_end: 1926 // Update literal length 1927 MOVQ DI, AX 1928 MOVQ BX, CX 1929 MOVQ DX, R14 1930 SHLQ CL, R14 1931 MOVB AH, CL 1932 SHRQ $0x20, AX 1933 TESTQ CX, CX 1934 JZ sequenceDecs_decodeSync_amd64_ll_update_zero 1935 ADDQ CX, BX 1936 CMPQ BX, $0x40 1937 JA sequenceDecs_decodeSync_amd64_ll_update_zero 1938 CMPQ CX, $0x40 1939 JAE sequenceDecs_decodeSync_amd64_ll_update_zero 1940 NEGQ CX 1941 SHRQ CL, R14 1942 ADDQ R14, AX 1943 1944 sequenceDecs_decodeSync_amd64_ll_update_zero: 1945 MOVQ AX, 24(SP) 1946 1947 // Fill bitreader for state updates 1948 MOVQ R13, (SP) 1949 MOVQ R9, AX 1950 SHRQ $0x08, AX 1951 MOVBQZX AL, AX 1952 MOVQ ctx+16(FP), CX 1953 CMPQ 96(CX), $0x00 1954 JZ sequenceDecs_decodeSync_amd64_skip_update 1955 1956 // Update Literal Length State 1957 MOVBQZX DI, R13 1958 SHRL $0x10, DI 1959 LEAQ (BX)(R13*1), CX 1960 MOVQ DX, R14 1961 MOVQ CX, BX 1962 ROLQ CL, R14 1963 MOVL $0x00000001, R15 1964 MOVB R13, CL 1965 SHLL CL, R15 1966 DECL R15 1967 ANDQ R15, R14 1968 ADDQ R14, DI 1969 1970 // Load ctx.llTable 1971 MOVQ ctx+16(FP), CX 1972 MOVQ (CX), CX 1973 MOVQ (CX)(DI*8), DI 1974 1975 // Update Match Length State 1976 MOVBQZX R8, R13 1977 SHRL $0x10, R8 1978 LEAQ (BX)(R13*1), CX 1979 MOVQ DX, R14 1980 MOVQ CX, BX 1981 ROLQ CL, R14 1982 MOVL $0x00000001, R15 1983 MOVB R13, CL 1984 SHLL CL, R15 1985 DECL R15 1986 ANDQ R15, R14 1987 ADDQ R14, R8 1988 1989 // Load 
ctx.mlTable 1990 MOVQ ctx+16(FP), CX 1991 MOVQ 24(CX), CX 1992 MOVQ (CX)(R8*8), R8 1993 1994 // Update Offset State 1995 MOVBQZX R9, R13 1996 SHRL $0x10, R9 1997 LEAQ (BX)(R13*1), CX 1998 MOVQ DX, R14 1999 MOVQ CX, BX 2000 ROLQ CL, R14 2001 MOVL $0x00000001, R15 2002 MOVB R13, CL 2003 SHLL CL, R15 2004 DECL R15 2005 ANDQ R15, R14 2006 ADDQ R14, R9 2007 2008 // Load ctx.ofTable 2009 MOVQ ctx+16(FP), CX 2010 MOVQ 48(CX), CX 2011 MOVQ (CX)(R9*8), R9 2012 2013 sequenceDecs_decodeSync_amd64_skip_update: 2014 // Adjust offset 2015 MOVQ s+0(FP), CX 2016 MOVQ 8(SP), R13 2017 CMPQ AX, $0x01 2018 JBE sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0 2019 MOVUPS 144(CX), X0 2020 MOVQ R13, 144(CX) 2021 MOVUPS X0, 152(CX) 2022 JMP sequenceDecs_decodeSync_amd64_after_adjust 2023 2024 sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0: 2025 CMPQ 24(SP), $0x00000000 2026 JNE sequenceDecs_decodeSync_amd64_adjust_offset_maybezero 2027 INCQ R13 2028 JMP sequenceDecs_decodeSync_amd64_adjust_offset_nonzero 2029 2030 sequenceDecs_decodeSync_amd64_adjust_offset_maybezero: 2031 TESTQ R13, R13 2032 JNZ sequenceDecs_decodeSync_amd64_adjust_offset_nonzero 2033 MOVQ 144(CX), R13 2034 JMP sequenceDecs_decodeSync_amd64_after_adjust 2035 2036 sequenceDecs_decodeSync_amd64_adjust_offset_nonzero: 2037 MOVQ R13, AX 2038 XORQ R14, R14 2039 MOVQ $-1, R15 2040 CMPQ R13, $0x03 2041 CMOVQEQ R14, AX 2042 CMOVQEQ R15, R14 2043 ADDQ 144(CX)(AX*8), R14 2044 JNZ sequenceDecs_decodeSync_amd64_adjust_temp_valid 2045 MOVQ $0x00000001, R14 2046 2047 sequenceDecs_decodeSync_amd64_adjust_temp_valid: 2048 CMPQ R13, $0x01 2049 JZ sequenceDecs_decodeSync_amd64_adjust_skip 2050 MOVQ 152(CX), AX 2051 MOVQ AX, 160(CX) 2052 2053 sequenceDecs_decodeSync_amd64_adjust_skip: 2054 MOVQ 144(CX), AX 2055 MOVQ AX, 152(CX) 2056 MOVQ R14, 144(CX) 2057 MOVQ R14, R13 2058 2059 sequenceDecs_decodeSync_amd64_after_adjust: 2060 MOVQ R13, 8(SP) 2061 2062 // Check values 2063 MOVQ 16(SP), AX 2064 MOVQ 24(SP), CX 2065 LEAQ (AX)(CX*1), 
R14 2066 MOVQ s+0(FP), R15 2067 ADDQ R14, 256(R15) 2068 MOVQ ctx+16(FP), R14 2069 SUBQ CX, 104(R14) 2070 JS error_not_enough_literals 2071 CMPQ AX, $0x00020002 2072 JA sequenceDecs_decodeSync_amd64_error_match_len_too_big 2073 TESTQ R13, R13 2074 JNZ sequenceDecs_decodeSync_amd64_match_len_ofs_ok 2075 TESTQ AX, AX 2076 JNZ sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch 2077 2078 sequenceDecs_decodeSync_amd64_match_len_ofs_ok: 2079 MOVQ 24(SP), AX 2080 MOVQ 8(SP), CX 2081 MOVQ 16(SP), R13 2082 2083 // Check if we have enough space in s.out 2084 LEAQ (AX)(R13*1), R14 2085 ADDQ R10, R14 2086 CMPQ R14, 32(SP) 2087 JA error_not_enough_space 2088 2089 // Copy literals 2090 TESTQ AX, AX 2091 JZ check_offset 2092 XORQ R14, R14 2093 2094 copy_1: 2095 MOVUPS (R11)(R14*1), X0 2096 MOVUPS X0, (R10)(R14*1) 2097 ADDQ $0x10, R14 2098 CMPQ R14, AX 2099 JB copy_1 2100 ADDQ AX, R11 2101 ADDQ AX, R10 2102 ADDQ AX, R12 2103 2104 // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize 2105 check_offset: 2106 MOVQ R12, AX 2107 ADDQ 40(SP), AX 2108 CMPQ CX, AX 2109 JG error_match_off_too_big 2110 CMPQ CX, 56(SP) 2111 JG error_match_off_too_big 2112 2113 // Copy match from history 2114 MOVQ CX, AX 2115 SUBQ R12, AX 2116 JLS copy_match 2117 MOVQ 48(SP), R14 2118 SUBQ AX, R14 2119 CMPQ R13, AX 2120 JG copy_all_from_history 2121 MOVQ R13, AX 2122 SUBQ $0x10, AX 2123 JB copy_4_small 2124 2125 copy_4_loop: 2126 MOVUPS (R14), X0 2127 MOVUPS X0, (R10) 2128 ADDQ $0x10, R14 2129 ADDQ $0x10, R10 2130 SUBQ $0x10, AX 2131 JAE copy_4_loop 2132 LEAQ 16(R14)(AX*1), R14 2133 LEAQ 16(R10)(AX*1), R10 2134 MOVUPS -16(R14), X0 2135 MOVUPS X0, -16(R10) 2136 JMP copy_4_end 2137 2138 copy_4_small: 2139 CMPQ R13, $0x03 2140 JE copy_4_move_3 2141 CMPQ R13, $0x08 2142 JB copy_4_move_4through7 2143 JMP copy_4_move_8through16 2144 2145 copy_4_move_3: 2146 MOVW (R14), AX 2147 MOVB 2(R14), CL 2148 MOVW AX, (R10) 2149 MOVB CL, 2(R10) 2150 ADDQ R13, R14 2151 ADDQ R13, R10 2152 JMP copy_4_end 
2153 2154 copy_4_move_4through7: 2155 MOVL (R14), AX 2156 MOVL -4(R14)(R13*1), CX 2157 MOVL AX, (R10) 2158 MOVL CX, -4(R10)(R13*1) 2159 ADDQ R13, R14 2160 ADDQ R13, R10 2161 JMP copy_4_end 2162 2163 copy_4_move_8through16: 2164 MOVQ (R14), AX 2165 MOVQ -8(R14)(R13*1), CX 2166 MOVQ AX, (R10) 2167 MOVQ CX, -8(R10)(R13*1) 2168 ADDQ R13, R14 2169 ADDQ R13, R10 2170 2171 copy_4_end: 2172 ADDQ R13, R12 2173 JMP handle_loop 2174 JMP loop_finished 2175 2176 copy_all_from_history: 2177 MOVQ AX, R15 2178 SUBQ $0x10, R15 2179 JB copy_5_small 2180 2181 copy_5_loop: 2182 MOVUPS (R14), X0 2183 MOVUPS X0, (R10) 2184 ADDQ $0x10, R14 2185 ADDQ $0x10, R10 2186 SUBQ $0x10, R15 2187 JAE copy_5_loop 2188 LEAQ 16(R14)(R15*1), R14 2189 LEAQ 16(R10)(R15*1), R10 2190 MOVUPS -16(R14), X0 2191 MOVUPS X0, -16(R10) 2192 JMP copy_5_end 2193 2194 copy_5_small: 2195 CMPQ AX, $0x03 2196 JE copy_5_move_3 2197 JB copy_5_move_1or2 2198 CMPQ AX, $0x08 2199 JB copy_5_move_4through7 2200 JMP copy_5_move_8through16 2201 2202 copy_5_move_1or2: 2203 MOVB (R14), R15 2204 MOVB -1(R14)(AX*1), BP 2205 MOVB R15, (R10) 2206 MOVB BP, -1(R10)(AX*1) 2207 ADDQ AX, R14 2208 ADDQ AX, R10 2209 JMP copy_5_end 2210 2211 copy_5_move_3: 2212 MOVW (R14), R15 2213 MOVB 2(R14), BP 2214 MOVW R15, (R10) 2215 MOVB BP, 2(R10) 2216 ADDQ AX, R14 2217 ADDQ AX, R10 2218 JMP copy_5_end 2219 2220 copy_5_move_4through7: 2221 MOVL (R14), R15 2222 MOVL -4(R14)(AX*1), BP 2223 MOVL R15, (R10) 2224 MOVL BP, -4(R10)(AX*1) 2225 ADDQ AX, R14 2226 ADDQ AX, R10 2227 JMP copy_5_end 2228 2229 copy_5_move_8through16: 2230 MOVQ (R14), R15 2231 MOVQ -8(R14)(AX*1), BP 2232 MOVQ R15, (R10) 2233 MOVQ BP, -8(R10)(AX*1) 2234 ADDQ AX, R14 2235 ADDQ AX, R10 2236 2237 copy_5_end: 2238 ADDQ AX, R12 2239 SUBQ AX, R13 2240 2241 // Copy match from the current buffer 2242 copy_match: 2243 MOVQ R10, AX 2244 SUBQ CX, AX 2245 2246 // ml <= mo 2247 CMPQ R13, CX 2248 JA copy_overlapping_match 2249 2250 // Copy non-overlapping match 2251 ADDQ R13, R12 2252 MOVQ R10, CX 
2253 ADDQ R13, R10 2254 2255 copy_2: 2256 MOVUPS (AX), X0 2257 MOVUPS X0, (CX) 2258 ADDQ $0x10, AX 2259 ADDQ $0x10, CX 2260 SUBQ $0x10, R13 2261 JHI copy_2 2262 JMP handle_loop 2263 2264 // Copy overlapping match 2265 copy_overlapping_match: 2266 ADDQ R13, R12 2267 2268 copy_slow_3: 2269 MOVB (AX), CL 2270 MOVB CL, (R10) 2271 INCQ AX 2272 INCQ R10 2273 DECQ R13 2274 JNZ copy_slow_3 2275 2276 handle_loop: 2277 MOVQ ctx+16(FP), AX 2278 DECQ 96(AX) 2279 JNS sequenceDecs_decodeSync_amd64_main_loop 2280 2281 loop_finished: 2282 MOVQ br+8(FP), AX 2283 MOVQ DX, 24(AX) 2284 MOVB BL, 32(AX) 2285 MOVQ SI, 8(AX) 2286 2287 // Update the context 2288 MOVQ ctx+16(FP), AX 2289 MOVQ R12, 136(AX) 2290 MOVQ 144(AX), CX 2291 SUBQ CX, R11 2292 MOVQ R11, 168(AX) 2293 2294 // Return success 2295 MOVQ $0x00000000, ret+24(FP) 2296 RET 2297 2298 // Return with match length error 2299 sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch: 2300 MOVQ 16(SP), AX 2301 MOVQ ctx+16(FP), CX 2302 MOVQ AX, 216(CX) 2303 MOVQ $0x00000001, ret+24(FP) 2304 RET 2305 2306 // Return with match too long error 2307 sequenceDecs_decodeSync_amd64_error_match_len_too_big: 2308 MOVQ ctx+16(FP), AX 2309 MOVQ 16(SP), CX 2310 MOVQ CX, 216(AX) 2311 MOVQ $0x00000002, ret+24(FP) 2312 RET 2313 2314 // Return with match offset too long error 2315 error_match_off_too_big: 2316 MOVQ ctx+16(FP), AX 2317 MOVQ 8(SP), CX 2318 MOVQ CX, 224(AX) 2319 MOVQ R12, 136(AX) 2320 MOVQ $0x00000003, ret+24(FP) 2321 RET 2322 2323 // Return with not enough literals error 2324 error_not_enough_literals: 2325 MOVQ ctx+16(FP), AX 2326 MOVQ 24(SP), CX 2327 MOVQ CX, 208(AX) 2328 MOVQ $0x00000004, ret+24(FP) 2329 RET 2330 2331 // Return with overread error 2332 error_overread: 2333 MOVQ $0x00000006, ret+24(FP) 2334 RET 2335 2336 // Return with not enough output space error 2337 error_not_enough_space: 2338 MOVQ ctx+16(FP), AX 2339 MOVQ 24(SP), CX 2340 MOVQ CX, 208(AX) 2341 MOVQ 16(SP), CX 2342 MOVQ CX, 216(AX) 2343 MOVQ R12, 136(AX) 2344 
MOVQ $0x00000005, ret+24(FP) 2345 RET 2346 2347 // func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int 2348 // Requires: BMI, BMI2, CMOV, SSE 2349 TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32 2350 MOVQ br+8(FP), BX 2351 MOVQ 24(BX), AX 2352 MOVBQZX 32(BX), DX 2353 MOVQ (BX), CX 2354 MOVQ 8(BX), BX 2355 ADDQ BX, CX 2356 MOVQ CX, (SP) 2357 MOVQ ctx+16(FP), CX 2358 MOVQ 72(CX), SI 2359 MOVQ 80(CX), DI 2360 MOVQ 88(CX), R8 2361 XORQ R9, R9 2362 MOVQ R9, 8(SP) 2363 MOVQ R9, 16(SP) 2364 MOVQ R9, 24(SP) 2365 MOVQ 112(CX), R9 2366 MOVQ 128(CX), R10 2367 MOVQ R10, 32(SP) 2368 MOVQ 144(CX), R10 2369 MOVQ 136(CX), R11 2370 MOVQ 200(CX), R12 2371 MOVQ R12, 56(SP) 2372 MOVQ 176(CX), R12 2373 MOVQ R12, 48(SP) 2374 MOVQ 184(CX), CX 2375 MOVQ CX, 40(SP) 2376 MOVQ 40(SP), CX 2377 ADDQ CX, 48(SP) 2378 2379 // Calculate pointer to s.out[cap(s.out)] (a past-end pointer) 2380 ADDQ R9, 32(SP) 2381 2382 // outBase += outPosition 2383 ADDQ R11, R9 2384 2385 sequenceDecs_decodeSync_bmi2_main_loop: 2386 MOVQ (SP), R12 2387 2388 // Fill bitreader to have enough for the offset and match length. 
2389 CMPQ BX, $0x08 2390 JL sequenceDecs_decodeSync_bmi2_fill_byte_by_byte 2391 MOVQ DX, CX 2392 SHRQ $0x03, CX 2393 SUBQ CX, R12 2394 MOVQ (R12), AX 2395 SUBQ CX, BX 2396 ANDQ $0x07, DX 2397 JMP sequenceDecs_decodeSync_bmi2_fill_end 2398 2399 sequenceDecs_decodeSync_bmi2_fill_byte_by_byte: 2400 CMPQ BX, $0x00 2401 JLE sequenceDecs_decodeSync_bmi2_fill_check_overread 2402 CMPQ DX, $0x07 2403 JLE sequenceDecs_decodeSync_bmi2_fill_end 2404 SHLQ $0x08, AX 2405 SUBQ $0x01, R12 2406 SUBQ $0x01, BX 2407 SUBQ $0x08, DX 2408 MOVBQZX (R12), CX 2409 ORQ CX, AX 2410 JMP sequenceDecs_decodeSync_bmi2_fill_byte_by_byte 2411 2412 sequenceDecs_decodeSync_bmi2_fill_check_overread: 2413 CMPQ DX, $0x40 2414 JA error_overread 2415 2416 sequenceDecs_decodeSync_bmi2_fill_end: 2417 // Update offset 2418 MOVQ $0x00000808, CX 2419 BEXTRQ CX, R8, R13 2420 MOVQ AX, R14 2421 LEAQ (DX)(R13*1), CX 2422 ROLQ CL, R14 2423 BZHIQ R13, R14, R14 2424 MOVQ CX, DX 2425 MOVQ R8, CX 2426 SHRQ $0x20, CX 2427 ADDQ R14, CX 2428 MOVQ CX, 8(SP) 2429 2430 // Update match length 2431 MOVQ $0x00000808, CX 2432 BEXTRQ CX, DI, R13 2433 MOVQ AX, R14 2434 LEAQ (DX)(R13*1), CX 2435 ROLQ CL, R14 2436 BZHIQ R13, R14, R14 2437 MOVQ CX, DX 2438 MOVQ DI, CX 2439 SHRQ $0x20, CX 2440 ADDQ R14, CX 2441 MOVQ CX, 16(SP) 2442 2443 // Fill bitreader to have enough for the remaining 2444 CMPQ BX, $0x08 2445 JL sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte 2446 MOVQ DX, CX 2447 SHRQ $0x03, CX 2448 SUBQ CX, R12 2449 MOVQ (R12), AX 2450 SUBQ CX, BX 2451 ANDQ $0x07, DX 2452 JMP sequenceDecs_decodeSync_bmi2_fill_2_end 2453 2454 sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte: 2455 CMPQ BX, $0x00 2456 JLE sequenceDecs_decodeSync_bmi2_fill_2_check_overread 2457 CMPQ DX, $0x07 2458 JLE sequenceDecs_decodeSync_bmi2_fill_2_end 2459 SHLQ $0x08, AX 2460 SUBQ $0x01, R12 2461 SUBQ $0x01, BX 2462 SUBQ $0x08, DX 2463 MOVBQZX (R12), CX 2464 ORQ CX, AX 2465 JMP sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte 2466 2467 
sequenceDecs_decodeSync_bmi2_fill_2_check_overread: 2468 CMPQ DX, $0x40 2469 JA error_overread 2470 2471 sequenceDecs_decodeSync_bmi2_fill_2_end: 2472 // Update literal length 2473 MOVQ $0x00000808, CX 2474 BEXTRQ CX, SI, R13 2475 MOVQ AX, R14 2476 LEAQ (DX)(R13*1), CX 2477 ROLQ CL, R14 2478 BZHIQ R13, R14, R14 2479 MOVQ CX, DX 2480 MOVQ SI, CX 2481 SHRQ $0x20, CX 2482 ADDQ R14, CX 2483 MOVQ CX, 24(SP) 2484 2485 // Fill bitreader for state updates 2486 MOVQ R12, (SP) 2487 MOVQ $0x00000808, CX 2488 BEXTRQ CX, R8, R12 2489 MOVQ ctx+16(FP), CX 2490 CMPQ 96(CX), $0x00 2491 JZ sequenceDecs_decodeSync_bmi2_skip_update 2492 LEAQ (SI)(DI*1), R13 2493 ADDQ R8, R13 2494 MOVBQZX R13, R13 2495 LEAQ (DX)(R13*1), CX 2496 MOVQ AX, R14 2497 MOVQ CX, DX 2498 ROLQ CL, R14 2499 BZHIQ R13, R14, R14 2500 2501 // Update Offset State 2502 BZHIQ R8, R14, CX 2503 SHRXQ R8, R14, R14 2504 SHRL $0x10, R8 2505 ADDQ CX, R8 2506 2507 // Load ctx.ofTable 2508 MOVQ ctx+16(FP), CX 2509 MOVQ 48(CX), CX 2510 MOVQ (CX)(R8*8), R8 2511 2512 // Update Match Length State 2513 BZHIQ DI, R14, CX 2514 SHRXQ DI, R14, R14 2515 SHRL $0x10, DI 2516 ADDQ CX, DI 2517 2518 // Load ctx.mlTable 2519 MOVQ ctx+16(FP), CX 2520 MOVQ 24(CX), CX 2521 MOVQ (CX)(DI*8), DI 2522 2523 // Update Literal Length State 2524 BZHIQ SI, R14, CX 2525 SHRL $0x10, SI 2526 ADDQ CX, SI 2527 2528 // Load ctx.llTable 2529 MOVQ ctx+16(FP), CX 2530 MOVQ (CX), CX 2531 MOVQ (CX)(SI*8), SI 2532 2533 sequenceDecs_decodeSync_bmi2_skip_update: 2534 // Adjust offset 2535 MOVQ s+0(FP), CX 2536 MOVQ 8(SP), R13 2537 CMPQ R12, $0x01 2538 JBE sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0 2539 MOVUPS 144(CX), X0 2540 MOVQ R13, 144(CX) 2541 MOVUPS X0, 152(CX) 2542 JMP sequenceDecs_decodeSync_bmi2_after_adjust 2543 2544 sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0: 2545 CMPQ 24(SP), $0x00000000 2546 JNE sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero 2547 INCQ R13 2548 JMP sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero 2549 2550 
sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero: 2551 TESTQ R13, R13 2552 JNZ sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero 2553 MOVQ 144(CX), R13 2554 JMP sequenceDecs_decodeSync_bmi2_after_adjust 2555 2556 sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero: 2557 MOVQ R13, R12 2558 XORQ R14, R14 2559 MOVQ $-1, R15 2560 CMPQ R13, $0x03 2561 CMOVQEQ R14, R12 2562 CMOVQEQ R15, R14 2563 ADDQ 144(CX)(R12*8), R14 2564 JNZ sequenceDecs_decodeSync_bmi2_adjust_temp_valid 2565 MOVQ $0x00000001, R14 2566 2567 sequenceDecs_decodeSync_bmi2_adjust_temp_valid: 2568 CMPQ R13, $0x01 2569 JZ sequenceDecs_decodeSync_bmi2_adjust_skip 2570 MOVQ 152(CX), R12 2571 MOVQ R12, 160(CX) 2572 2573 sequenceDecs_decodeSync_bmi2_adjust_skip: 2574 MOVQ 144(CX), R12 2575 MOVQ R12, 152(CX) 2576 MOVQ R14, 144(CX) 2577 MOVQ R14, R13 2578 2579 sequenceDecs_decodeSync_bmi2_after_adjust: 2580 MOVQ R13, 8(SP) 2581 2582 // Check values 2583 MOVQ 16(SP), CX 2584 MOVQ 24(SP), R12 2585 LEAQ (CX)(R12*1), R14 2586 MOVQ s+0(FP), R15 2587 ADDQ R14, 256(R15) 2588 MOVQ ctx+16(FP), R14 2589 SUBQ R12, 104(R14) 2590 JS error_not_enough_literals 2591 CMPQ CX, $0x00020002 2592 JA sequenceDecs_decodeSync_bmi2_error_match_len_too_big 2593 TESTQ R13, R13 2594 JNZ sequenceDecs_decodeSync_bmi2_match_len_ofs_ok 2595 TESTQ CX, CX 2596 JNZ sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch 2597 2598 sequenceDecs_decodeSync_bmi2_match_len_ofs_ok: 2599 MOVQ 24(SP), CX 2600 MOVQ 8(SP), R12 2601 MOVQ 16(SP), R13 2602 2603 // Check if we have enough space in s.out 2604 LEAQ (CX)(R13*1), R14 2605 ADDQ R9, R14 2606 CMPQ R14, 32(SP) 2607 JA error_not_enough_space 2608 2609 // Copy literals 2610 TESTQ CX, CX 2611 JZ check_offset 2612 XORQ R14, R14 2613 2614 copy_1: 2615 MOVUPS (R10)(R14*1), X0 2616 MOVUPS X0, (R9)(R14*1) 2617 ADDQ $0x10, R14 2618 CMPQ R14, CX 2619 JB copy_1 2620 ADDQ CX, R10 2621 ADDQ CX, R9 2622 ADDQ CX, R11 2623 2624 // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize 2625 
check_offset: 2626 MOVQ R11, CX 2627 ADDQ 40(SP), CX 2628 CMPQ R12, CX 2629 JG error_match_off_too_big 2630 CMPQ R12, 56(SP) 2631 JG error_match_off_too_big 2632 2633 // Copy match from history 2634 MOVQ R12, CX 2635 SUBQ R11, CX 2636 JLS copy_match 2637 MOVQ 48(SP), R14 2638 SUBQ CX, R14 2639 CMPQ R13, CX 2640 JG copy_all_from_history 2641 MOVQ R13, CX 2642 SUBQ $0x10, CX 2643 JB copy_4_small 2644 2645 copy_4_loop: 2646 MOVUPS (R14), X0 2647 MOVUPS X0, (R9) 2648 ADDQ $0x10, R14 2649 ADDQ $0x10, R9 2650 SUBQ $0x10, CX 2651 JAE copy_4_loop 2652 LEAQ 16(R14)(CX*1), R14 2653 LEAQ 16(R9)(CX*1), R9 2654 MOVUPS -16(R14), X0 2655 MOVUPS X0, -16(R9) 2656 JMP copy_4_end 2657 2658 copy_4_small: 2659 CMPQ R13, $0x03 2660 JE copy_4_move_3 2661 CMPQ R13, $0x08 2662 JB copy_4_move_4through7 2663 JMP copy_4_move_8through16 2664 2665 copy_4_move_3: 2666 MOVW (R14), CX 2667 MOVB 2(R14), R12 2668 MOVW CX, (R9) 2669 MOVB R12, 2(R9) 2670 ADDQ R13, R14 2671 ADDQ R13, R9 2672 JMP copy_4_end 2673 2674 copy_4_move_4through7: 2675 MOVL (R14), CX 2676 MOVL -4(R14)(R13*1), R12 2677 MOVL CX, (R9) 2678 MOVL R12, -4(R9)(R13*1) 2679 ADDQ R13, R14 2680 ADDQ R13, R9 2681 JMP copy_4_end 2682 2683 copy_4_move_8through16: 2684 MOVQ (R14), CX 2685 MOVQ -8(R14)(R13*1), R12 2686 MOVQ CX, (R9) 2687 MOVQ R12, -8(R9)(R13*1) 2688 ADDQ R13, R14 2689 ADDQ R13, R9 2690 2691 copy_4_end: 2692 ADDQ R13, R11 2693 JMP handle_loop 2694 JMP loop_finished 2695 2696 copy_all_from_history: 2697 MOVQ CX, R15 2698 SUBQ $0x10, R15 2699 JB copy_5_small 2700 2701 copy_5_loop: 2702 MOVUPS (R14), X0 2703 MOVUPS X0, (R9) 2704 ADDQ $0x10, R14 2705 ADDQ $0x10, R9 2706 SUBQ $0x10, R15 2707 JAE copy_5_loop 2708 LEAQ 16(R14)(R15*1), R14 2709 LEAQ 16(R9)(R15*1), R9 2710 MOVUPS -16(R14), X0 2711 MOVUPS X0, -16(R9) 2712 JMP copy_5_end 2713 2714 copy_5_small: 2715 CMPQ CX, $0x03 2716 JE copy_5_move_3 2717 JB copy_5_move_1or2 2718 CMPQ CX, $0x08 2719 JB copy_5_move_4through7 2720 JMP copy_5_move_8through16 2721 2722 copy_5_move_1or2: 
2723 MOVB (R14), R15 2724 MOVB -1(R14)(CX*1), BP 2725 MOVB R15, (R9) 2726 MOVB BP, -1(R9)(CX*1) 2727 ADDQ CX, R14 2728 ADDQ CX, R9 2729 JMP copy_5_end 2730 2731 copy_5_move_3: 2732 MOVW (R14), R15 2733 MOVB 2(R14), BP 2734 MOVW R15, (R9) 2735 MOVB BP, 2(R9) 2736 ADDQ CX, R14 2737 ADDQ CX, R9 2738 JMP copy_5_end 2739 2740 copy_5_move_4through7: 2741 MOVL (R14), R15 2742 MOVL -4(R14)(CX*1), BP 2743 MOVL R15, (R9) 2744 MOVL BP, -4(R9)(CX*1) 2745 ADDQ CX, R14 2746 ADDQ CX, R9 2747 JMP copy_5_end 2748 2749 copy_5_move_8through16: 2750 MOVQ (R14), R15 2751 MOVQ -8(R14)(CX*1), BP 2752 MOVQ R15, (R9) 2753 MOVQ BP, -8(R9)(CX*1) 2754 ADDQ CX, R14 2755 ADDQ CX, R9 2756 2757 copy_5_end: 2758 ADDQ CX, R11 2759 SUBQ CX, R13 2760 2761 // Copy match from the current buffer 2762 copy_match: 2763 MOVQ R9, CX 2764 SUBQ R12, CX 2765 2766 // ml <= mo 2767 CMPQ R13, R12 2768 JA copy_overlapping_match 2769 2770 // Copy non-overlapping match 2771 ADDQ R13, R11 2772 MOVQ R9, R12 2773 ADDQ R13, R9 2774 2775 copy_2: 2776 MOVUPS (CX), X0 2777 MOVUPS X0, (R12) 2778 ADDQ $0x10, CX 2779 ADDQ $0x10, R12 2780 SUBQ $0x10, R13 2781 JHI copy_2 2782 JMP handle_loop 2783 2784 // Copy overlapping match 2785 copy_overlapping_match: 2786 ADDQ R13, R11 2787 2788 copy_slow_3: 2789 MOVB (CX), R12 2790 MOVB R12, (R9) 2791 INCQ CX 2792 INCQ R9 2793 DECQ R13 2794 JNZ copy_slow_3 2795 2796 handle_loop: 2797 MOVQ ctx+16(FP), CX 2798 DECQ 96(CX) 2799 JNS sequenceDecs_decodeSync_bmi2_main_loop 2800 2801 loop_finished: 2802 MOVQ br+8(FP), CX 2803 MOVQ AX, 24(CX) 2804 MOVB DL, 32(CX) 2805 MOVQ BX, 8(CX) 2806 2807 // Update the context 2808 MOVQ ctx+16(FP), AX 2809 MOVQ R11, 136(AX) 2810 MOVQ 144(AX), CX 2811 SUBQ CX, R10 2812 MOVQ R10, 168(AX) 2813 2814 // Return success 2815 MOVQ $0x00000000, ret+24(FP) 2816 RET 2817 2818 // Return with match length error 2819 sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch: 2820 MOVQ 16(SP), AX 2821 MOVQ ctx+16(FP), CX 2822 MOVQ AX, 216(CX) 2823 MOVQ $0x00000001, 
ret+24(FP) 2824 RET 2825 2826 // Return with match too long error 2827 sequenceDecs_decodeSync_bmi2_error_match_len_too_big: 2828 MOVQ ctx+16(FP), AX 2829 MOVQ 16(SP), CX 2830 MOVQ CX, 216(AX) 2831 MOVQ $0x00000002, ret+24(FP) 2832 RET 2833 2834 // Return with match offset too long error 2835 error_match_off_too_big: 2836 MOVQ ctx+16(FP), AX 2837 MOVQ 8(SP), CX 2838 MOVQ CX, 224(AX) 2839 MOVQ R11, 136(AX) 2840 MOVQ $0x00000003, ret+24(FP) 2841 RET 2842 2843 // Return with not enough literals error 2844 error_not_enough_literals: 2845 MOVQ ctx+16(FP), AX 2846 MOVQ 24(SP), CX 2847 MOVQ CX, 208(AX) 2848 MOVQ $0x00000004, ret+24(FP) 2849 RET 2850 2851 // Return with overread error 2852 error_overread: 2853 MOVQ $0x00000006, ret+24(FP) 2854 RET 2855 2856 // Return with not enough output space error 2857 error_not_enough_space: 2858 MOVQ ctx+16(FP), AX 2859 MOVQ 24(SP), CX 2860 MOVQ CX, 208(AX) 2861 MOVQ 16(SP), CX 2862 MOVQ CX, 216(AX) 2863 MOVQ R11, 136(AX) 2864 MOVQ $0x00000005, ret+24(FP) 2865 RET 2866 2867 // func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int 2868 // Requires: CMOV, SSE 2869 TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32 2870 MOVQ br+8(FP), CX 2871 MOVQ 24(CX), DX 2872 MOVBQZX 32(CX), BX 2873 MOVQ (CX), AX 2874 MOVQ 8(CX), SI 2875 ADDQ SI, AX 2876 MOVQ AX, (SP) 2877 MOVQ ctx+16(FP), AX 2878 MOVQ 72(AX), DI 2879 MOVQ 80(AX), R8 2880 MOVQ 88(AX), R9 2881 XORQ CX, CX 2882 MOVQ CX, 8(SP) 2883 MOVQ CX, 16(SP) 2884 MOVQ CX, 24(SP) 2885 MOVQ 112(AX), R10 2886 MOVQ 128(AX), CX 2887 MOVQ CX, 32(SP) 2888 MOVQ 144(AX), R11 2889 MOVQ 136(AX), R12 2890 MOVQ 200(AX), CX 2891 MOVQ CX, 56(SP) 2892 MOVQ 176(AX), CX 2893 MOVQ CX, 48(SP) 2894 MOVQ 184(AX), AX 2895 MOVQ AX, 40(SP) 2896 MOVQ 40(SP), AX 2897 ADDQ AX, 48(SP) 2898 2899 // Calculate pointer to s.out[cap(s.out)] (a past-end pointer) 2900 ADDQ R10, 32(SP) 2901 2902 // outBase += outPosition 2903 ADDQ R12, R10 2904 2905
sequenceDecs_decodeSync_safe_amd64_main_loop: 2906 MOVQ (SP), R13 2907 2908 // Fill bitreader to have enough for the offset and match length. 2909 CMPQ SI, $0x08 2910 JL sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte 2911 MOVQ BX, AX 2912 SHRQ $0x03, AX 2913 SUBQ AX, R13 2914 MOVQ (R13), DX 2915 SUBQ AX, SI 2916 ANDQ $0x07, BX 2917 JMP sequenceDecs_decodeSync_safe_amd64_fill_end 2918 2919 sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte: 2920 CMPQ SI, $0x00 2921 JLE sequenceDecs_decodeSync_safe_amd64_fill_check_overread 2922 CMPQ BX, $0x07 2923 JLE sequenceDecs_decodeSync_safe_amd64_fill_end 2924 SHLQ $0x08, DX 2925 SUBQ $0x01, R13 2926 SUBQ $0x01, SI 2927 SUBQ $0x08, BX 2928 MOVBQZX (R13), AX 2929 ORQ AX, DX 2930 JMP sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte 2931 2932 sequenceDecs_decodeSync_safe_amd64_fill_check_overread: 2933 CMPQ BX, $0x40 2934 JA error_overread 2935 2936 sequenceDecs_decodeSync_safe_amd64_fill_end: 2937 // Update offset 2938 MOVQ R9, AX 2939 MOVQ BX, CX 2940 MOVQ DX, R14 2941 SHLQ CL, R14 2942 MOVB AH, CL 2943 SHRQ $0x20, AX 2944 TESTQ CX, CX 2945 JZ sequenceDecs_decodeSync_safe_amd64_of_update_zero 2946 ADDQ CX, BX 2947 CMPQ BX, $0x40 2948 JA sequenceDecs_decodeSync_safe_amd64_of_update_zero 2949 CMPQ CX, $0x40 2950 JAE sequenceDecs_decodeSync_safe_amd64_of_update_zero 2951 NEGQ CX 2952 SHRQ CL, R14 2953 ADDQ R14, AX 2954 2955 sequenceDecs_decodeSync_safe_amd64_of_update_zero: 2956 MOVQ AX, 8(SP) 2957 2958 // Update match length 2959 MOVQ R8, AX 2960 MOVQ BX, CX 2961 MOVQ DX, R14 2962 SHLQ CL, R14 2963 MOVB AH, CL 2964 SHRQ $0x20, AX 2965 TESTQ CX, CX 2966 JZ sequenceDecs_decodeSync_safe_amd64_ml_update_zero 2967 ADDQ CX, BX 2968 CMPQ BX, $0x40 2969 JA sequenceDecs_decodeSync_safe_amd64_ml_update_zero 2970 CMPQ CX, $0x40 2971 JAE sequenceDecs_decodeSync_safe_amd64_ml_update_zero 2972 NEGQ CX 2973 SHRQ CL, R14 2974 ADDQ R14, AX 2975 2976 sequenceDecs_decodeSync_safe_amd64_ml_update_zero: 2977 MOVQ AX, 16(SP) 2978 2979 // 
Fill bitreader to have enough for the remaining 2980 CMPQ SI, $0x08 2981 JL sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte 2982 MOVQ BX, AX 2983 SHRQ $0x03, AX 2984 SUBQ AX, R13 2985 MOVQ (R13), DX 2986 SUBQ AX, SI 2987 ANDQ $0x07, BX 2988 JMP sequenceDecs_decodeSync_safe_amd64_fill_2_end 2989 2990 sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte: 2991 CMPQ SI, $0x00 2992 JLE sequenceDecs_decodeSync_safe_amd64_fill_2_check_overread 2993 CMPQ BX, $0x07 2994 JLE sequenceDecs_decodeSync_safe_amd64_fill_2_end 2995 SHLQ $0x08, DX 2996 SUBQ $0x01, R13 2997 SUBQ $0x01, SI 2998 SUBQ $0x08, BX 2999 MOVBQZX (R13), AX 3000 ORQ AX, DX 3001 JMP sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte 3002 3003 sequenceDecs_decodeSync_safe_amd64_fill_2_check_overread: 3004 CMPQ BX, $0x40 3005 JA error_overread 3006 3007 sequenceDecs_decodeSync_safe_amd64_fill_2_end: 3008 // Update literal length 3009 MOVQ DI, AX 3010 MOVQ BX, CX 3011 MOVQ DX, R14 3012 SHLQ CL, R14 3013 MOVB AH, CL 3014 SHRQ $0x20, AX 3015 TESTQ CX, CX 3016 JZ sequenceDecs_decodeSync_safe_amd64_ll_update_zero 3017 ADDQ CX, BX 3018 CMPQ BX, $0x40 3019 JA sequenceDecs_decodeSync_safe_amd64_ll_update_zero 3020 CMPQ CX, $0x40 3021 JAE sequenceDecs_decodeSync_safe_amd64_ll_update_zero 3022 NEGQ CX 3023 SHRQ CL, R14 3024 ADDQ R14, AX 3025 3026 sequenceDecs_decodeSync_safe_amd64_ll_update_zero: 3027 MOVQ AX, 24(SP) 3028 3029 // Fill bitreader for state updates 3030 MOVQ R13, (SP) 3031 MOVQ R9, AX 3032 SHRQ $0x08, AX 3033 MOVBQZX AL, AX 3034 MOVQ ctx+16(FP), CX 3035 CMPQ 96(CX), $0x00 3036 JZ sequenceDecs_decodeSync_safe_amd64_skip_update 3037 3038 // Update Literal Length State 3039 MOVBQZX DI, R13 3040 SHRL $0x10, DI 3041 LEAQ (BX)(R13*1), CX 3042 MOVQ DX, R14 3043 MOVQ CX, BX 3044 ROLQ CL, R14 3045 MOVL $0x00000001, R15 3046 MOVB R13, CL 3047 SHLL CL, R15 3048 DECL R15 3049 ANDQ R15, R14 3050 ADDQ R14, DI 3051 3052 // Load ctx.llTable 3053 MOVQ ctx+16(FP), CX 3054 MOVQ (CX), CX 3055 MOVQ (CX)(DI*8), DI 
3056 3057 // Update Match Length State 3058 MOVBQZX R8, R13 3059 SHRL $0x10, R8 3060 LEAQ (BX)(R13*1), CX 3061 MOVQ DX, R14 3062 MOVQ CX, BX 3063 ROLQ CL, R14 3064 MOVL $0x00000001, R15 3065 MOVB R13, CL 3066 SHLL CL, R15 3067 DECL R15 3068 ANDQ R15, R14 3069 ADDQ R14, R8 3070 3071 // Load ctx.mlTable 3072 MOVQ ctx+16(FP), CX 3073 MOVQ 24(CX), CX 3074 MOVQ (CX)(R8*8), R8 3075 3076 // Update Offset State 3077 MOVBQZX R9, R13 3078 SHRL $0x10, R9 3079 LEAQ (BX)(R13*1), CX 3080 MOVQ DX, R14 3081 MOVQ CX, BX 3082 ROLQ CL, R14 3083 MOVL $0x00000001, R15 3084 MOVB R13, CL 3085 SHLL CL, R15 3086 DECL R15 3087 ANDQ R15, R14 3088 ADDQ R14, R9 3089 3090 // Load ctx.ofTable 3091 MOVQ ctx+16(FP), CX 3092 MOVQ 48(CX), CX 3093 MOVQ (CX)(R9*8), R9 3094 3095 sequenceDecs_decodeSync_safe_amd64_skip_update: 3096 // Adjust offset 3097 MOVQ s+0(FP), CX 3098 MOVQ 8(SP), R13 3099 CMPQ AX, $0x01 3100 JBE sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0 3101 MOVUPS 144(CX), X0 3102 MOVQ R13, 144(CX) 3103 MOVUPS X0, 152(CX) 3104 JMP sequenceDecs_decodeSync_safe_amd64_after_adjust 3105 3106 sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0: 3107 CMPQ 24(SP), $0x00000000 3108 JNE sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero 3109 INCQ R13 3110 JMP sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero 3111 3112 sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero: 3113 TESTQ R13, R13 3114 JNZ sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero 3115 MOVQ 144(CX), R13 3116 JMP sequenceDecs_decodeSync_safe_amd64_after_adjust 3117 3118 sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero: 3119 MOVQ R13, AX 3120 XORQ R14, R14 3121 MOVQ $-1, R15 3122 CMPQ R13, $0x03 3123 CMOVQEQ R14, AX 3124 CMOVQEQ R15, R14 3125 ADDQ 144(CX)(AX*8), R14 3126 JNZ sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid 3127 MOVQ $0x00000001, R14 3128 3129 sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid: 3130 CMPQ R13, $0x01 3131 JZ 
sequenceDecs_decodeSync_safe_amd64_adjust_skip 3132 MOVQ 152(CX), AX 3133 MOVQ AX, 160(CX) 3134 3135 sequenceDecs_decodeSync_safe_amd64_adjust_skip: 3136 MOVQ 144(CX), AX 3137 MOVQ AX, 152(CX) 3138 MOVQ R14, 144(CX) 3139 MOVQ R14, R13 3140 3141 sequenceDecs_decodeSync_safe_amd64_after_adjust: 3142 MOVQ R13, 8(SP) 3143 3144 // Check values 3145 MOVQ 16(SP), AX 3146 MOVQ 24(SP), CX 3147 LEAQ (AX)(CX*1), R14 3148 MOVQ s+0(FP), R15 3149 ADDQ R14, 256(R15) 3150 MOVQ ctx+16(FP), R14 3151 SUBQ CX, 104(R14) 3152 JS error_not_enough_literals 3153 CMPQ AX, $0x00020002 3154 JA sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big 3155 TESTQ R13, R13 3156 JNZ sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok 3157 TESTQ AX, AX 3158 JNZ sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch 3159 3160 sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok: 3161 MOVQ 24(SP), AX 3162 MOVQ 8(SP), CX 3163 MOVQ 16(SP), R13 3164 3165 // Check if we have enough space in s.out 3166 LEAQ (AX)(R13*1), R14 3167 ADDQ R10, R14 3168 CMPQ R14, 32(SP) 3169 JA error_not_enough_space 3170 3171 // Copy literals 3172 TESTQ AX, AX 3173 JZ check_offset 3174 MOVQ AX, R14 3175 SUBQ $0x10, R14 3176 JB copy_1_small 3177 3178 copy_1_loop: 3179 MOVUPS (R11), X0 3180 MOVUPS X0, (R10) 3181 ADDQ $0x10, R11 3182 ADDQ $0x10, R10 3183 SUBQ $0x10, R14 3184 JAE copy_1_loop 3185 LEAQ 16(R11)(R14*1), R11 3186 LEAQ 16(R10)(R14*1), R10 3187 MOVUPS -16(R11), X0 3188 MOVUPS X0, -16(R10) 3189 JMP copy_1_end 3190 3191 copy_1_small: 3192 CMPQ AX, $0x03 3193 JE copy_1_move_3 3194 JB copy_1_move_1or2 3195 CMPQ AX, $0x08 3196 JB copy_1_move_4through7 3197 JMP copy_1_move_8through16 3198 3199 copy_1_move_1or2: 3200 MOVB (R11), R14 3201 MOVB -1(R11)(AX*1), R15 3202 MOVB R14, (R10) 3203 MOVB R15, -1(R10)(AX*1) 3204 ADDQ AX, R11 3205 ADDQ AX, R10 3206 JMP copy_1_end 3207 3208 copy_1_move_3: 3209 MOVW (R11), R14 3210 MOVB 2(R11), R15 3211 MOVW R14, (R10) 3212 MOVB R15, 2(R10) 3213 ADDQ AX, R11 3214 ADDQ AX, R10 3215 
JMP copy_1_end 3216 3217 copy_1_move_4through7: 3218 MOVL (R11), R14 3219 MOVL -4(R11)(AX*1), R15 3220 MOVL R14, (R10) 3221 MOVL R15, -4(R10)(AX*1) 3222 ADDQ AX, R11 3223 ADDQ AX, R10 3224 JMP copy_1_end 3225 3226 copy_1_move_8through16: 3227 MOVQ (R11), R14 3228 MOVQ -8(R11)(AX*1), R15 3229 MOVQ R14, (R10) 3230 MOVQ R15, -8(R10)(AX*1) 3231 ADDQ AX, R11 3232 ADDQ AX, R10 3233 3234 copy_1_end: 3235 ADDQ AX, R12 3236 3237 // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize 3238 check_offset: 3239 MOVQ R12, AX 3240 ADDQ 40(SP), AX 3241 CMPQ CX, AX 3242 JG error_match_off_too_big 3243 CMPQ CX, 56(SP) 3244 JG error_match_off_too_big 3245 3246 // Copy match from history 3247 MOVQ CX, AX 3248 SUBQ R12, AX 3249 JLS copy_match 3250 MOVQ 48(SP), R14 3251 SUBQ AX, R14 3252 CMPQ R13, AX 3253 JG copy_all_from_history 3254 MOVQ R13, AX 3255 SUBQ $0x10, AX 3256 JB copy_4_small 3257 3258 copy_4_loop: 3259 MOVUPS (R14), X0 3260 MOVUPS X0, (R10) 3261 ADDQ $0x10, R14 3262 ADDQ $0x10, R10 3263 SUBQ $0x10, AX 3264 JAE copy_4_loop 3265 LEAQ 16(R14)(AX*1), R14 3266 LEAQ 16(R10)(AX*1), R10 3267 MOVUPS -16(R14), X0 3268 MOVUPS X0, -16(R10) 3269 JMP copy_4_end 3270 3271 copy_4_small: 3272 CMPQ R13, $0x03 3273 JE copy_4_move_3 3274 CMPQ R13, $0x08 3275 JB copy_4_move_4through7 3276 JMP copy_4_move_8through16 3277 3278 copy_4_move_3: 3279 MOVW (R14), AX 3280 MOVB 2(R14), CL 3281 MOVW AX, (R10) 3282 MOVB CL, 2(R10) 3283 ADDQ R13, R14 3284 ADDQ R13, R10 3285 JMP copy_4_end 3286 3287 copy_4_move_4through7: 3288 MOVL (R14), AX 3289 MOVL -4(R14)(R13*1), CX 3290 MOVL AX, (R10) 3291 MOVL CX, -4(R10)(R13*1) 3292 ADDQ R13, R14 3293 ADDQ R13, R10 3294 JMP copy_4_end 3295 3296 copy_4_move_8through16: 3297 MOVQ (R14), AX 3298 MOVQ -8(R14)(R13*1), CX 3299 MOVQ AX, (R10) 3300 MOVQ CX, -8(R10)(R13*1) 3301 ADDQ R13, R14 3302 ADDQ R13, R10 3303 3304 copy_4_end: 3305 ADDQ R13, R12 3306 JMP handle_loop 3307 JMP loop_finished 3308 3309 copy_all_from_history: 3310 MOVQ AX, R15 3311 SUBQ $0x10,
R15 3312 JB copy_5_small 3313 3314 copy_5_loop: 3315 MOVUPS (R14), X0 3316 MOVUPS X0, (R10) 3317 ADDQ $0x10, R14 3318 ADDQ $0x10, R10 3319 SUBQ $0x10, R15 3320 JAE copy_5_loop 3321 LEAQ 16(R14)(R15*1), R14 3322 LEAQ 16(R10)(R15*1), R10 3323 MOVUPS -16(R14), X0 3324 MOVUPS X0, -16(R10) 3325 JMP copy_5_end 3326 3327 copy_5_small: 3328 CMPQ AX, $0x03 3329 JE copy_5_move_3 3330 JB copy_5_move_1or2 3331 CMPQ AX, $0x08 3332 JB copy_5_move_4through7 3333 JMP copy_5_move_8through16 3334 3335 copy_5_move_1or2: 3336 MOVB (R14), R15 3337 MOVB -1(R14)(AX*1), BP 3338 MOVB R15, (R10) 3339 MOVB BP, -1(R10)(AX*1) 3340 ADDQ AX, R14 3341 ADDQ AX, R10 3342 JMP copy_5_end 3343 3344 copy_5_move_3: 3345 MOVW (R14), R15 3346 MOVB 2(R14), BP 3347 MOVW R15, (R10) 3348 MOVB BP, 2(R10) 3349 ADDQ AX, R14 3350 ADDQ AX, R10 3351 JMP copy_5_end 3352 3353 copy_5_move_4through7: 3354 MOVL (R14), R15 3355 MOVL -4(R14)(AX*1), BP 3356 MOVL R15, (R10) 3357 MOVL BP, -4(R10)(AX*1) 3358 ADDQ AX, R14 3359 ADDQ AX, R10 3360 JMP copy_5_end 3361 3362 copy_5_move_8through16: 3363 MOVQ (R14), R15 3364 MOVQ -8(R14)(AX*1), BP 3365 MOVQ R15, (R10) 3366 MOVQ BP, -8(R10)(AX*1) 3367 ADDQ AX, R14 3368 ADDQ AX, R10 3369 3370 copy_5_end: 3371 ADDQ AX, R12 3372 SUBQ AX, R13 3373 3374 // Copy match from the current buffer 3375 copy_match: 3376 MOVQ R10, AX 3377 SUBQ CX, AX 3378 3379 // ml <= mo 3380 CMPQ R13, CX 3381 JA copy_overlapping_match 3382 3383 // Copy non-overlapping match 3384 ADDQ R13, R12 3385 MOVQ R13, CX 3386 SUBQ $0x10, CX 3387 JB copy_2_small 3388 3389 copy_2_loop: 3390 MOVUPS (AX), X0 3391 MOVUPS X0, (R10) 3392 ADDQ $0x10, AX 3393 ADDQ $0x10, R10 3394 SUBQ $0x10, CX 3395 JAE copy_2_loop 3396 LEAQ 16(AX)(CX*1), AX 3397 LEAQ 16(R10)(CX*1), R10 3398 MOVUPS -16(AX), X0 3399 MOVUPS X0, -16(R10) 3400 JMP copy_2_end 3401 3402 copy_2_small: 3403 CMPQ R13, $0x03 3404 JE copy_2_move_3 3405 JB copy_2_move_1or2 3406 CMPQ R13, $0x08 3407 JB copy_2_move_4through7 3408 JMP copy_2_move_8through16 3409 3410 
copy_2_move_1or2: 3411 MOVB (AX), CL 3412 MOVB -1(AX)(R13*1), R14 3413 MOVB CL, (R10) 3414 MOVB R14, -1(R10)(R13*1) 3415 ADDQ R13, AX 3416 ADDQ R13, R10 3417 JMP copy_2_end 3418 3419 copy_2_move_3: 3420 MOVW (AX), CX 3421 MOVB 2(AX), R14 3422 MOVW CX, (R10) 3423 MOVB R14, 2(R10) 3424 ADDQ R13, AX 3425 ADDQ R13, R10 3426 JMP copy_2_end 3427 3428 copy_2_move_4through7: 3429 MOVL (AX), CX 3430 MOVL -4(AX)(R13*1), R14 3431 MOVL CX, (R10) 3432 MOVL R14, -4(R10)(R13*1) 3433 ADDQ R13, AX 3434 ADDQ R13, R10 3435 JMP copy_2_end 3436 3437 copy_2_move_8through16: 3438 MOVQ (AX), CX 3439 MOVQ -8(AX)(R13*1), R14 3440 MOVQ CX, (R10) 3441 MOVQ R14, -8(R10)(R13*1) 3442 ADDQ R13, AX 3443 ADDQ R13, R10 3444 3445 copy_2_end: 3446 JMP handle_loop 3447 3448 // Copy overlapping match 3449 copy_overlapping_match: 3450 ADDQ R13, R12 3451 3452 copy_slow_3: 3453 MOVB (AX), CL 3454 MOVB CL, (R10) 3455 INCQ AX 3456 INCQ R10 3457 DECQ R13 3458 JNZ copy_slow_3 3459 3460 handle_loop: 3461 MOVQ ctx+16(FP), AX 3462 DECQ 96(AX) 3463 JNS sequenceDecs_decodeSync_safe_amd64_main_loop 3464 3465 loop_finished: 3466 MOVQ br+8(FP), AX 3467 MOVQ DX, 24(AX) 3468 MOVB BL, 32(AX) 3469 MOVQ SI, 8(AX) 3470 3471 // Update the context 3472 MOVQ ctx+16(FP), AX 3473 MOVQ R12, 136(AX) 3474 MOVQ 144(AX), CX 3475 SUBQ CX, R11 3476 MOVQ R11, 168(AX) 3477 3478 // Return success 3479 MOVQ $0x00000000, ret+24(FP) 3480 RET 3481 3482 // Return with match length error 3483 sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch: 3484 MOVQ 16(SP), AX 3485 MOVQ ctx+16(FP), CX 3486 MOVQ AX, 216(CX) 3487 MOVQ $0x00000001, ret+24(FP) 3488 RET 3489 3490 // Return with match too long error 3491 sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big: 3492 MOVQ ctx+16(FP), AX 3493 MOVQ 16(SP), CX 3494 MOVQ CX, 216(AX) 3495 MOVQ $0x00000002, ret+24(FP) 3496 RET 3497 3498 // Return with match offset too long error 3499 error_match_off_too_big: 3500 MOVQ ctx+16(FP), AX 3501 MOVQ 8(SP), CX 3502 MOVQ CX, 224(AX) 3503 MOVQ R12, 
136(AX) 3504 MOVQ $0x00000003, ret+24(FP) 3505 RET 3506 3507 // Return with not enough literals error 3508 error_not_enough_literals: 3509 MOVQ ctx+16(FP), AX 3510 MOVQ 24(SP), CX 3511 MOVQ CX, 208(AX) 3512 MOVQ $0x00000004, ret+24(FP) 3513 RET 3514 3515 // Return with overread error 3516 error_overread: 3517 MOVQ $0x00000006, ret+24(FP) 3518 RET 3519 3520 // Return with not enough output space error 3521 error_not_enough_space: 3522 MOVQ ctx+16(FP), AX 3523 MOVQ 24(SP), CX 3524 MOVQ CX, 208(AX) 3525 MOVQ 16(SP), CX 3526 MOVQ CX, 216(AX) 3527 MOVQ R12, 136(AX) 3528 MOVQ $0x00000005, ret+24(FP) 3529 RET 3530 3531 // func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int 3532 // Requires: BMI, BMI2, CMOV, SSE 3533 TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32 3534 MOVQ br+8(FP), BX 3535 MOVQ 24(BX), AX 3536 MOVBQZX 32(BX), DX 3537 MOVQ (BX), CX 3538 MOVQ 8(BX), BX 3539 ADDQ BX, CX 3540 MOVQ CX, (SP) 3541 MOVQ ctx+16(FP), CX 3542 MOVQ 72(CX), SI 3543 MOVQ 80(CX), DI 3544 MOVQ 88(CX), R8 3545 XORQ R9, R9 3546 MOVQ R9, 8(SP) 3547 MOVQ R9, 16(SP) 3548 MOVQ R9, 24(SP) 3549 MOVQ 112(CX), R9 3550 MOVQ 128(CX), R10 3551 MOVQ R10, 32(SP) 3552 MOVQ 144(CX), R10 3553 MOVQ 136(CX), R11 3554 MOVQ 200(CX), R12 3555 MOVQ R12, 56(SP) 3556 MOVQ 176(CX), R12 3557 MOVQ R12, 48(SP) 3558 MOVQ 184(CX), CX 3559 MOVQ CX, 40(SP) 3560 MOVQ 40(SP), CX 3561 ADDQ CX, 48(SP) 3562 3563 // Calculate pointer to s.out[cap(s.out)] (a past-end pointer) 3564 ADDQ R9, 32(SP) 3565 3566 // outBase += outPosition 3567 ADDQ R11, R9 3568 3569 sequenceDecs_decodeSync_safe_bmi2_main_loop: 3570 MOVQ (SP), R12 3571 3572 // Fill bitreader to have enough for the offset and match length.
3573 CMPQ BX, $0x08 3574 JL sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte 3575 MOVQ DX, CX 3576 SHRQ $0x03, CX 3577 SUBQ CX, R12 3578 MOVQ (R12), AX 3579 SUBQ CX, BX 3580 ANDQ $0x07, DX 3581 JMP sequenceDecs_decodeSync_safe_bmi2_fill_end 3582 3583 sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte: 3584 CMPQ BX, $0x00 3585 JLE sequenceDecs_decodeSync_safe_bmi2_fill_check_overread 3586 CMPQ DX, $0x07 3587 JLE sequenceDecs_decodeSync_safe_bmi2_fill_end 3588 SHLQ $0x08, AX 3589 SUBQ $0x01, R12 3590 SUBQ $0x01, BX 3591 SUBQ $0x08, DX 3592 MOVBQZX (R12), CX 3593 ORQ CX, AX 3594 JMP sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte 3595 3596 sequenceDecs_decodeSync_safe_bmi2_fill_check_overread: 3597 CMPQ DX, $0x40 3598 JA error_overread 3599 3600 sequenceDecs_decodeSync_safe_bmi2_fill_end: 3601 // Update offset 3602 MOVQ $0x00000808, CX 3603 BEXTRQ CX, R8, R13 3604 MOVQ AX, R14 3605 LEAQ (DX)(R13*1), CX 3606 ROLQ CL, R14 3607 BZHIQ R13, R14, R14 3608 MOVQ CX, DX 3609 MOVQ R8, CX 3610 SHRQ $0x20, CX 3611 ADDQ R14, CX 3612 MOVQ CX, 8(SP) 3613 3614 // Update match length 3615 MOVQ $0x00000808, CX 3616 BEXTRQ CX, DI, R13 3617 MOVQ AX, R14 3618 LEAQ (DX)(R13*1), CX 3619 ROLQ CL, R14 3620 BZHIQ R13, R14, R14 3621 MOVQ CX, DX 3622 MOVQ DI, CX 3623 SHRQ $0x20, CX 3624 ADDQ R14, CX 3625 MOVQ CX, 16(SP) 3626 3627 // Fill bitreader to have enough for the remaining 3628 CMPQ BX, $0x08 3629 JL sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte 3630 MOVQ DX, CX 3631 SHRQ $0x03, CX 3632 SUBQ CX, R12 3633 MOVQ (R12), AX 3634 SUBQ CX, BX 3635 ANDQ $0x07, DX 3636 JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_end 3637 3638 sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte: 3639 CMPQ BX, $0x00 3640 JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_check_overread 3641 CMPQ DX, $0x07 3642 JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_end 3643 SHLQ $0x08, AX 3644 SUBQ $0x01, R12 3645 SUBQ $0x01, BX 3646 SUBQ $0x08, DX 3647 MOVBQZX (R12), CX 3648 ORQ CX, AX 3649 JMP 
sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte 3650 3651 sequenceDecs_decodeSync_safe_bmi2_fill_2_check_overread: 3652 CMPQ DX, $0x40 3653 JA error_overread 3654 3655 sequenceDecs_decodeSync_safe_bmi2_fill_2_end: 3656 // Update literal length 3657 MOVQ $0x00000808, CX 3658 BEXTRQ CX, SI, R13 3659 MOVQ AX, R14 3660 LEAQ (DX)(R13*1), CX 3661 ROLQ CL, R14 3662 BZHIQ R13, R14, R14 3663 MOVQ CX, DX 3664 MOVQ SI, CX 3665 SHRQ $0x20, CX 3666 ADDQ R14, CX 3667 MOVQ CX, 24(SP) 3668 3669 // Fill bitreader for state updates 3670 MOVQ R12, (SP) 3671 MOVQ $0x00000808, CX 3672 BEXTRQ CX, R8, R12 3673 MOVQ ctx+16(FP), CX 3674 CMPQ 96(CX), $0x00 3675 JZ sequenceDecs_decodeSync_safe_bmi2_skip_update 3676 LEAQ (SI)(DI*1), R13 3677 ADDQ R8, R13 3678 MOVBQZX R13, R13 3679 LEAQ (DX)(R13*1), CX 3680 MOVQ AX, R14 3681 MOVQ CX, DX 3682 ROLQ CL, R14 3683 BZHIQ R13, R14, R14 3684 3685 // Update Offset State 3686 BZHIQ R8, R14, CX 3687 SHRXQ R8, R14, R14 3688 SHRL $0x10, R8 3689 ADDQ CX, R8 3690 3691 // Load ctx.ofTable 3692 MOVQ ctx+16(FP), CX 3693 MOVQ 48(CX), CX 3694 MOVQ (CX)(R8*8), R8 3695 3696 // Update Match Length State 3697 BZHIQ DI, R14, CX 3698 SHRXQ DI, R14, R14 3699 SHRL $0x10, DI 3700 ADDQ CX, DI 3701 3702 // Load ctx.mlTable 3703 MOVQ ctx+16(FP), CX 3704 MOVQ 24(CX), CX 3705 MOVQ (CX)(DI*8), DI 3706 3707 // Update Literal Length State 3708 BZHIQ SI, R14, CX 3709 SHRL $0x10, SI 3710 ADDQ CX, SI 3711 3712 // Load ctx.llTable 3713 MOVQ ctx+16(FP), CX 3714 MOVQ (CX), CX 3715 MOVQ (CX)(SI*8), SI 3716 3717 sequenceDecs_decodeSync_safe_bmi2_skip_update: 3718 // Adjust offset 3719 MOVQ s+0(FP), CX 3720 MOVQ 8(SP), R13 3721 CMPQ R12, $0x01 3722 JBE sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0 3723 MOVUPS 144(CX), X0 3724 MOVQ R13, 144(CX) 3725 MOVUPS X0, 152(CX) 3726 JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust 3727 3728 sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0: 3729 CMPQ 24(SP), $0x00000000 3730 JNE 
sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero 3731 INCQ R13 3732 JMP sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero 3733 3734 sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero: 3735 TESTQ R13, R13 3736 JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero 3737 MOVQ 144(CX), R13 3738 JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust 3739 3740 sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero: 3741 MOVQ R13, R12 3742 XORQ R14, R14 3743 MOVQ $-1, R15 3744 CMPQ R13, $0x03 3745 CMOVQEQ R14, R12 3746 CMOVQEQ R15, R14 3747 ADDQ 144(CX)(R12*8), R14 3748 JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid 3749 MOVQ $0x00000001, R14 3750 3751 sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid: 3752 CMPQ R13, $0x01 3753 JZ sequenceDecs_decodeSync_safe_bmi2_adjust_skip 3754 MOVQ 152(CX), R12 3755 MOVQ R12, 160(CX) 3756 3757 sequenceDecs_decodeSync_safe_bmi2_adjust_skip: 3758 MOVQ 144(CX), R12 3759 MOVQ R12, 152(CX) 3760 MOVQ R14, 144(CX) 3761 MOVQ R14, R13 3762 3763 sequenceDecs_decodeSync_safe_bmi2_after_adjust: 3764 MOVQ R13, 8(SP) 3765 3766 // Check values 3767 MOVQ 16(SP), CX 3768 MOVQ 24(SP), R12 3769 LEAQ (CX)(R12*1), R14 3770 MOVQ s+0(FP), R15 3771 ADDQ R14, 256(R15) 3772 MOVQ ctx+16(FP), R14 3773 SUBQ R12, 104(R14) 3774 JS error_not_enough_literals 3775 CMPQ CX, $0x00020002 3776 JA sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big 3777 TESTQ R13, R13 3778 JNZ sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok 3779 TESTQ CX, CX 3780 JNZ sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch 3781 3782 sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok: 3783 MOVQ 24(SP), CX 3784 MOVQ 8(SP), R12 3785 MOVQ 16(SP), R13 3786 3787 // Check if we have enough space in s.out 3788 LEAQ (CX)(R13*1), R14 3789 ADDQ R9, R14 3790 CMPQ R14, 32(SP) 3791 JA error_not_enough_space 3792 3793 // Copy literals 3794 TESTQ CX, CX 3795 JZ check_offset 3796 MOVQ CX, R14 3797 SUBQ $0x10, R14 3798 JB copy_1_small 3799 3800 copy_1_loop: 
3801 MOVUPS (R10), X0 3802 MOVUPS X0, (R9) 3803 ADDQ $0x10, R10 3804 ADDQ $0x10, R9 3805 SUBQ $0x10, R14 3806 JAE copy_1_loop 3807 LEAQ 16(R10)(R14*1), R10 3808 LEAQ 16(R9)(R14*1), R9 3809 MOVUPS -16(R10), X0 3810 MOVUPS X0, -16(R9) 3811 JMP copy_1_end 3812 3813 copy_1_small: 3814 CMPQ CX, $0x03 3815 JE copy_1_move_3 3816 JB copy_1_move_1or2 3817 CMPQ CX, $0x08 3818 JB copy_1_move_4through7 3819 JMP copy_1_move_8through16 3820 3821 copy_1_move_1or2: 3822 MOVB (R10), R14 3823 MOVB -1(R10)(CX*1), R15 3824 MOVB R14, (R9) 3825 MOVB R15, -1(R9)(CX*1) 3826 ADDQ CX, R10 3827 ADDQ CX, R9 3828 JMP copy_1_end 3829 3830 copy_1_move_3: 3831 MOVW (R10), R14 3832 MOVB 2(R10), R15 3833 MOVW R14, (R9) 3834 MOVB R15, 2(R9) 3835 ADDQ CX, R10 3836 ADDQ CX, R9 3837 JMP copy_1_end 3838 3839 copy_1_move_4through7: 3840 MOVL (R10), R14 3841 MOVL -4(R10)(CX*1), R15 3842 MOVL R14, (R9) 3843 MOVL R15, -4(R9)(CX*1) 3844 ADDQ CX, R10 3845 ADDQ CX, R9 3846 JMP copy_1_end 3847 3848 copy_1_move_8through16: 3849 MOVQ (R10), R14 3850 MOVQ -8(R10)(CX*1), R15 3851 MOVQ R14, (R9) 3852 MOVQ R15, -8(R9)(CX*1) 3853 ADDQ CX, R10 3854 ADDQ CX, R9 3855 3856 copy_1_end: 3857 ADDQ CX, R11 3858 3859 // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize 3860 check_offset: 3861 MOVQ R11, CX 3862 ADDQ 40(SP), CX 3863 CMPQ R12, CX 3864 JG error_match_off_too_big 3865 CMPQ R12, 56(SP) 3866 JG error_match_off_too_big 3867 3868 // Copy match from history 3869 MOVQ R12, CX 3870 SUBQ R11, CX 3871 JLS copy_match 3872 MOVQ 48(SP), R14 3873 SUBQ CX, R14 3874 CMPQ R13, CX 3875 JG copy_all_from_history 3876 MOVQ R13, CX 3877 SUBQ $0x10, CX 3878 JB copy_4_small 3879 3880 copy_4_loop: 3881 MOVUPS (R14), X0 3882 MOVUPS X0, (R9) 3883 ADDQ $0x10, R14 3884 ADDQ $0x10, R9 3885 SUBQ $0x10, CX 3886 JAE copy_4_loop 3887 LEAQ 16(R14)(CX*1), R14 3888 LEAQ 16(R9)(CX*1), R9 3889 MOVUPS -16(R14), X0 3890 MOVUPS X0, -16(R9) 3891 JMP copy_4_end 3892 3893 copy_4_small: 3894 CMPQ R13, $0x03 3895 JE copy_4_move_3 3896 CMPQ R13,
$0x08 3897 JB copy_4_move_4through7 3898 JMP copy_4_move_8through16 3899 3900 copy_4_move_3: 3901 MOVW (R14), CX 3902 MOVB 2(R14), R12 3903 MOVW CX, (R9) 3904 MOVB R12, 2(R9) 3905 ADDQ R13, R14 3906 ADDQ R13, R9 3907 JMP copy_4_end 3908 3909 copy_4_move_4through7: 3910 MOVL (R14), CX 3911 MOVL -4(R14)(R13*1), R12 3912 MOVL CX, (R9) 3913 MOVL R12, -4(R9)(R13*1) 3914 ADDQ R13, R14 3915 ADDQ R13, R9 3916 JMP copy_4_end 3917 3918 copy_4_move_8through16: 3919 MOVQ (R14), CX 3920 MOVQ -8(R14)(R13*1), R12 3921 MOVQ CX, (R9) 3922 MOVQ R12, -8(R9)(R13*1) 3923 ADDQ R13, R14 3924 ADDQ R13, R9 3925 3926 copy_4_end: 3927 ADDQ R13, R11 3928 JMP handle_loop 3929 JMP loop_finished 3930 3931 copy_all_from_history: 3932 MOVQ CX, R15 3933 SUBQ $0x10, R15 3934 JB copy_5_small 3935 3936 copy_5_loop: 3937 MOVUPS (R14), X0 3938 MOVUPS X0, (R9) 3939 ADDQ $0x10, R14 3940 ADDQ $0x10, R9 3941 SUBQ $0x10, R15 3942 JAE copy_5_loop 3943 LEAQ 16(R14)(R15*1), R14 3944 LEAQ 16(R9)(R15*1), R9 3945 MOVUPS -16(R14), X0 3946 MOVUPS X0, -16(R9) 3947 JMP copy_5_end 3948 3949 copy_5_small: 3950 CMPQ CX, $0x03 3951 JE copy_5_move_3 3952 JB copy_5_move_1or2 3953 CMPQ CX, $0x08 3954 JB copy_5_move_4through7 3955 JMP copy_5_move_8through16 3956 3957 copy_5_move_1or2: 3958 MOVB (R14), R15 3959 MOVB -1(R14)(CX*1), BP 3960 MOVB R15, (R9) 3961 MOVB BP, -1(R9)(CX*1) 3962 ADDQ CX, R14 3963 ADDQ CX, R9 3964 JMP copy_5_end 3965 3966 copy_5_move_3: 3967 MOVW (R14), R15 3968 MOVB 2(R14), BP 3969 MOVW R15, (R9) 3970 MOVB BP, 2(R9) 3971 ADDQ CX, R14 3972 ADDQ CX, R9 3973 JMP copy_5_end 3974 3975 copy_5_move_4through7: 3976 MOVL (R14), R15 3977 MOVL -4(R14)(CX*1), BP 3978 MOVL R15, (R9) 3979 MOVL BP, -4(R9)(CX*1) 3980 ADDQ CX, R14 3981 ADDQ CX, R9 3982 JMP copy_5_end 3983 3984 copy_5_move_8through16: 3985 MOVQ (R14), R15 3986 MOVQ -8(R14)(CX*1), BP 3987 MOVQ R15, (R9) 3988 MOVQ BP, -8(R9)(CX*1) 3989 ADDQ CX, R14 3990 ADDQ CX, R9 3991 3992 copy_5_end: 3993 ADDQ CX, R11 3994 SUBQ CX, R13 3995 3996 // Copy match from the 
current buffer 3997 copy_match: 3998 MOVQ R9, CX 3999 SUBQ R12, CX 4000 4001 // ml <= mo 4002 CMPQ R13, R12 4003 JA copy_overlapping_match 4004 4005 // Copy non-overlapping match 4006 ADDQ R13, R11 4007 MOVQ R13, R12 4008 SUBQ $0x10, R12 4009 JB copy_2_small 4010 4011 copy_2_loop: 4012 MOVUPS (CX), X0 4013 MOVUPS X0, (R9) 4014 ADDQ $0x10, CX 4015 ADDQ $0x10, R9 4016 SUBQ $0x10, R12 4017 JAE copy_2_loop 4018 LEAQ 16(CX)(R12*1), CX 4019 LEAQ 16(R9)(R12*1), R9 4020 MOVUPS -16(CX), X0 4021 MOVUPS X0, -16(R9) 4022 JMP copy_2_end 4023 4024 copy_2_small: 4025 CMPQ R13, $0x03 4026 JE copy_2_move_3 4027 JB copy_2_move_1or2 4028 CMPQ R13, $0x08 4029 JB copy_2_move_4through7 4030 JMP copy_2_move_8through16 4031 4032 copy_2_move_1or2: 4033 MOVB (CX), R12 4034 MOVB -1(CX)(R13*1), R14 4035 MOVB R12, (R9) 4036 MOVB R14, -1(R9)(R13*1) 4037 ADDQ R13, CX 4038 ADDQ R13, R9 4039 JMP copy_2_end 4040 4041 copy_2_move_3: 4042 MOVW (CX), R12 4043 MOVB 2(CX), R14 4044 MOVW R12, (R9) 4045 MOVB R14, 2(R9) 4046 ADDQ R13, CX 4047 ADDQ R13, R9 4048 JMP copy_2_end 4049 4050 copy_2_move_4through7: 4051 MOVL (CX), R12 4052 MOVL -4(CX)(R13*1), R14 4053 MOVL R12, (R9) 4054 MOVL R14, -4(R9)(R13*1) 4055 ADDQ R13, CX 4056 ADDQ R13, R9 4057 JMP copy_2_end 4058 4059 copy_2_move_8through16: 4060 MOVQ (CX), R12 4061 MOVQ -8(CX)(R13*1), R14 4062 MOVQ R12, (R9) 4063 MOVQ R14, -8(R9)(R13*1) 4064 ADDQ R13, CX 4065 ADDQ R13, R9 4066 4067 copy_2_end: 4068 JMP handle_loop 4069 4070 // Copy overlapping match 4071 copy_overlapping_match: 4072 ADDQ R13, R11 4073 4074 copy_slow_3: 4075 MOVB (CX), R12 4076 MOVB R12, (R9) 4077 INCQ CX 4078 INCQ R9 4079 DECQ R13 4080 JNZ copy_slow_3 4081 4082 handle_loop: 4083 MOVQ ctx+16(FP), CX 4084 DECQ 96(CX) 4085 JNS sequenceDecs_decodeSync_safe_bmi2_main_loop 4086 4087 loop_finished: 4088 MOVQ br+8(FP), CX 4089 MOVQ AX, 24(CX) 4090 MOVB DL, 32(CX) 4091 MOVQ BX, 8(CX) 4092 4093 // Update the context 4094 MOVQ ctx+16(FP), AX 4095 MOVQ R11, 136(AX) 4096 MOVQ 144(AX), CX 4097 SUBQ CX, 
R10 4098 MOVQ R10, 168(AX) 4099 4100 // Return success 4101 MOVQ $0x00000000, ret+24(FP) 4102 RET 4103 4104 // Return with match length error 4105 sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch: 4106 MOVQ 16(SP), AX 4107 MOVQ ctx+16(FP), CX 4108 MOVQ AX, 216(CX) 4109 MOVQ $0x00000001, ret+24(FP) 4110 RET 4111 4112 // Return with match too long error 4113 sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big: 4114 MOVQ ctx+16(FP), AX 4115 MOVQ 16(SP), CX 4116 MOVQ CX, 216(AX) 4117 MOVQ $0x00000002, ret+24(FP) 4118 RET 4119 4120 // Return with match offset too long error 4121 error_match_off_too_big: 4122 MOVQ ctx+16(FP), AX 4123 MOVQ 8(SP), CX 4124 MOVQ CX, 224(AX) 4125 MOVQ R11, 136(AX) 4126 MOVQ $0x00000003, ret+24(FP) 4127 RET 4128 4129 // Return with not enough literals error 4130 error_not_enough_literals: 4131 MOVQ ctx+16(FP), AX 4132 MOVQ 24(SP), CX 4133 MOVQ CX, 208(AX) 4134 MOVQ $0x00000004, ret+24(FP) 4135 RET 4136 4137 // Return with overread error 4138 error_overread: 4139 MOVQ $0x00000006, ret+24(FP) 4140 RET 4141 4142 // Return with not enough output space error 4143 error_not_enough_space: 4144 MOVQ ctx+16(FP), AX 4145 MOVQ 24(SP), CX 4146 MOVQ CX, 208(AX) 4147 MOVQ 16(SP), CX 4148 MOVQ CX, 216(AX) 4149 MOVQ R11, 136(AX) 4150 MOVQ $0x00000005, ret+24(FP) 4151 RET