github.com/mdaxf/iac@v0.0.0-20240519030858-58a061660378/vendor_skip/github.com/klauspost/compress/zstd/seqdec_amd64.s (about)

     1  // Code generated by command: go run gen.go -out ../seqdec_amd64.s -pkg=zstd. DO NOT EDIT.
     2  
     3  //go:build !appengine && !noasm && gc && !noasm
     4  
     5  // func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
     6  // Requires: CMOV
        //
        // Decodes a run of sequences. Each loop iteration writes three 8-byte values
        // to the output cursor loaded from 104(ctx): literal length at +0, match
        // length at +8 and offset at +16; the cursor then advances by 24 bytes.
        // The loop is driven by the counter at 96(ctx), which is decremented once
        // per sequence and terminates the loop when it becomes negative.
        //
        // Register roles inside the loop:
        //   DX          bit container
        //   BX          number of bits already consumed from DX
        //   SI          8(br): input bytes not yet loaded (presumably len(br.in) - TODO confirm)
        //   (SP)        current read pointer (starts at base+SI and moves towards base)
        //   DI, R8, R9  literal-length, match-length and offset state words
        //   R10         output cursor
        //   R11-R13     three most recent offsets, loaded from 144/152/160(s)
        //
        // State word layout as used by the code below:
        //   bits 0..7   bit count for the next-state read
        //   bits 8..15  number of extra bits added to the baseline
        //   bits 16..31 base index of the next state
        //   bits 32..63 value baseline
        //
        // Return value: 0 success, 1 non-zero match length with zero offset,
        // 2 match length too big, 4 not enough literals, 6 bit reader overread.
        // On the error returns the cached offsets and bit-reader registers are
        // not stored back to s or br.
     7  	TEXT ·sequenceDecs_decode_amd64(SB), $8-32
     8  	MOVQ    br+8(FP), CX
     9  	MOVQ    24(CX), DX  // DX = bit container
    10  	MOVBQZX 32(CX), BX  // BX = bits consumed so far (single byte, zero-extended)
    11  	MOVQ    (CX), AX  // AX = input base pointer
    12  	MOVQ    8(CX), SI  // SI = remaining input byte count
    13  	ADDQ    SI, AX  // AX = base + remaining = current read position
    14  	MOVQ    AX, (SP)  // the 8-byte frame holds the read pointer between iterations
    15  	MOVQ    ctx+16(FP), AX
    16  	MOVQ    72(AX), DI  // literal-length state word
    17  	MOVQ    80(AX), R8  // match-length state word
    18  	MOVQ    88(AX), R9  // offset state word
    19  	MOVQ    104(AX), R10  // output cursor
    20  	MOVQ    s+0(FP), AX
    21  	MOVQ    144(AX), R11  // most recent offset
    22  	MOVQ    152(AX), R12  // second most recent offset
    23  	MOVQ    160(AX), R13  // third most recent offset
    24  
    25  sequenceDecs_decode_amd64_main_loop:
    26  	MOVQ (SP), R14  // R14 = read pointer for this iteration
    27  
    28  	// Fill bitreader to have enough for the offset and match length.
    29  	CMPQ SI, $0x08
    30  	JL   sequenceDecs_decode_amd64_fill_byte_by_byte  // fewer than 8 bytes left (signed compare): slow path
    31  	MOVQ BX, AX
    32  	SHRQ $0x03, AX  // AX = whole bytes already consumed
    33  	SUBQ AX, R14  // step the pointer back by that many bytes
    34  	MOVQ (R14), DX  // reload the full 8-byte container from the new position
    35  	SUBQ AX, SI
    36  	ANDQ $0x07, BX  // only the sub-byte bit position remains consumed
    37  	JMP  sequenceDecs_decode_amd64_fill_end
    38  
    39  sequenceDecs_decode_amd64_fill_byte_by_byte:
    40  	CMPQ    SI, $0x00
    41  	JLE     sequenceDecs_decode_amd64_fill_check_overread  // input exhausted: only validate BX
    42  	CMPQ    BX, $0x07
    43  	JLE     sequenceDecs_decode_amd64_fill_end  // less than one whole byte consumed: nothing to refill
    44  	SHLQ    $0x08, DX  // make room for one byte at the low end
    45  	SUBQ    $0x01, R14
    46  	SUBQ    $0x01, SI
    47  	SUBQ    $0x08, BX
    48  	MOVBQZX (R14), AX
    49  	ORQ     AX, DX
    50  	JMP     sequenceDecs_decode_amd64_fill_byte_by_byte
    51  
    52  sequenceDecs_decode_amd64_fill_check_overread:
    53  	CMPQ BX, $0x40
    54  	JA   error_overread  // more than 64 bits consumed with no input left
    55  
    56  sequenceDecs_decode_amd64_fill_end:
    57  	// Update offset
    58  	MOVQ  R9, AX
    59  	MOVQ  BX, CX
    60  	MOVQ  DX, R15
    61  	SHLQ  CL, R15  // drop the bits that were already consumed
    62  	MOVB  AH, CL  // CL = extra-bit count (bits 8..15 of the state word)
    63  	SHRQ  $0x20, AX  // AX = baseline (upper 32 bits of the state word)
    64  	TESTQ CX, CX
    65  	JZ    sequenceDecs_decode_amd64_of_update_zero  // no extra bits: value is the baseline
    66  	ADDQ  CX, BX  // account for the bits about to be read
    67  	CMPQ  BX, $0x40
    68  	JA    sequenceDecs_decode_amd64_of_update_zero  // would read past the container: add nothing
    69  	CMPQ  CX, $0x40
    70  	JAE   sequenceDecs_decode_amd64_of_update_zero
    71  	NEGQ  CX  // shift counts are taken mod 64, so this shifts right by 64-n
    72  	SHRQ  CL, R15  // R15 = top n unread bits, right-aligned
    73  	ADDQ  R15, AX
    74  
    75  sequenceDecs_decode_amd64_of_update_zero:
    76  	MOVQ AX, 16(R10)  // store offset value
    77  
    78  	// Update match length
    79  	MOVQ  R8, AX  // same read sequence as for the offset, using the match-length state
    80  	MOVQ  BX, CX
    81  	MOVQ  DX, R15
    82  	SHLQ  CL, R15
    83  	MOVB  AH, CL
    84  	SHRQ  $0x20, AX
    85  	TESTQ CX, CX
    86  	JZ    sequenceDecs_decode_amd64_ml_update_zero
    87  	ADDQ  CX, BX
    88  	CMPQ  BX, $0x40
    89  	JA    sequenceDecs_decode_amd64_ml_update_zero
    90  	CMPQ  CX, $0x40
    91  	JAE   sequenceDecs_decode_amd64_ml_update_zero
    92  	NEGQ  CX
    93  	SHRQ  CL, R15
    94  	ADDQ  R15, AX
    95  
    96  sequenceDecs_decode_amd64_ml_update_zero:
    97  	MOVQ AX, 8(R10)  // store match length
    98  
    99  	// Fill bitreader to have enough for the remaining
   100  	CMPQ SI, $0x08
   101  	JL   sequenceDecs_decode_amd64_fill_2_byte_by_byte
   102  	MOVQ BX, AX  // same refill logic as the first fill above
   103  	SHRQ $0x03, AX
   104  	SUBQ AX, R14
   105  	MOVQ (R14), DX
   106  	SUBQ AX, SI
   107  	ANDQ $0x07, BX
   108  	JMP  sequenceDecs_decode_amd64_fill_2_end
   109  
   110  sequenceDecs_decode_amd64_fill_2_byte_by_byte:
   111  	CMPQ    SI, $0x00
   112  	JLE     sequenceDecs_decode_amd64_fill_2_check_overread
   113  	CMPQ    BX, $0x07
   114  	JLE     sequenceDecs_decode_amd64_fill_2_end
   115  	SHLQ    $0x08, DX
   116  	SUBQ    $0x01, R14
   117  	SUBQ    $0x01, SI
   118  	SUBQ    $0x08, BX
   119  	MOVBQZX (R14), AX
   120  	ORQ     AX, DX
   121  	JMP     sequenceDecs_decode_amd64_fill_2_byte_by_byte
   122  
   123  sequenceDecs_decode_amd64_fill_2_check_overread:
   124  	CMPQ BX, $0x40
   125  	JA   error_overread
   126  
   127  sequenceDecs_decode_amd64_fill_2_end:
   128  	// Update literal length
   129  	MOVQ  DI, AX  // same read sequence, using the literal-length state
   130  	MOVQ  BX, CX
   131  	MOVQ  DX, R15
   132  	SHLQ  CL, R15
   133  	MOVB  AH, CL
   134  	SHRQ  $0x20, AX
   135  	TESTQ CX, CX
   136  	JZ    sequenceDecs_decode_amd64_ll_update_zero
   137  	ADDQ  CX, BX
   138  	CMPQ  BX, $0x40
   139  	JA    sequenceDecs_decode_amd64_ll_update_zero
   140  	CMPQ  CX, $0x40
   141  	JAE   sequenceDecs_decode_amd64_ll_update_zero
   142  	NEGQ  CX
   143  	SHRQ  CL, R15
   144  	ADDQ  R15, AX
   145  
   146  sequenceDecs_decode_amd64_ll_update_zero:
   147  	MOVQ AX, (R10)  // store literal length
   148  
   149  	// Fill bitreader for state updates
   150  	MOVQ    R14, (SP)  // save the read pointer; R14 is reused as scratch below
   151  	MOVQ    R9, AX
   152  	SHRQ    $0x08, AX
   153  	MOVBQZX AL, AX  // AX = offset extra-bit count, captured before R9 is replaced
   154  	MOVQ    ctx+16(FP), CX
   155  	CMPQ    96(CX), $0x00
   156  	JZ      sequenceDecs_decode_amd64_skip_update  // counter is zero: last sequence, states are not advanced
   157  
   158  	// Update Literal Length State
   159  	MOVBQZX DI, R14  // R14 = bit count for the next state (low byte)
   160  	SHRL    $0x10, DI  // DI = next-state base (32-bit shift clears the upper half)
   161  	LEAQ    (BX)(R14*1), CX
   162  	MOVQ    DX, R15
   163  	MOVQ    CX, BX  // BX advances by the bits being read
   164  	ROLQ    CL, R15  // rotate so the bits just read sit in the low end
   165  	MOVL    $0x00000001, BP
   166  	MOVB    R14, CL
   167  	SHLL    CL, BP
   168  	DECL    BP  // BP = (1 << bit count) - 1
   169  	ANDQ    BP, R15
   170  	ADDQ    R15, DI  // DI = index of the next state
   171  
   172  	// Load ctx.llTable
   173  	MOVQ ctx+16(FP), CX
   174  	MOVQ (CX), CX
   175  	MOVQ (CX)(DI*8), DI  // table entries are 8 bytes each
   176  
   177  	// Update Match Length State
   178  	MOVBQZX R8, R14
   179  	SHRL    $0x10, R8
   180  	LEAQ    (BX)(R14*1), CX
   181  	MOVQ    DX, R15
   182  	MOVQ    CX, BX
   183  	ROLQ    CL, R15
   184  	MOVL    $0x00000001, BP
   185  	MOVB    R14, CL
   186  	SHLL    CL, BP
   187  	DECL    BP
   188  	ANDQ    BP, R15
   189  	ADDQ    R15, R8
   190  
   191  	// Load ctx.mlTable
   192  	MOVQ ctx+16(FP), CX
   193  	MOVQ 24(CX), CX
   194  	MOVQ (CX)(R8*8), R8
   195  
   196  	// Update Offset State
   197  	MOVBQZX R9, R14
   198  	SHRL    $0x10, R9
   199  	LEAQ    (BX)(R14*1), CX
   200  	MOVQ    DX, R15
   201  	MOVQ    CX, BX
   202  	ROLQ    CL, R15
   203  	MOVL    $0x00000001, BP
   204  	MOVB    R14, CL
   205  	SHLL    CL, BP
   206  	DECL    BP
   207  	ANDQ    BP, R15
   208  	ADDQ    R15, R9
   209  
   210  	// Load ctx.ofTable
   211  	MOVQ ctx+16(FP), CX
   212  	MOVQ 48(CX), CX
   213  	MOVQ (CX)(R9*8), R9
   214  
   215  sequenceDecs_decode_amd64_skip_update:
   216  	// Adjust offset
   217  	MOVQ 16(R10), CX  // CX = offset value decoded above
   218  	CMPQ AX, $0x01
   219  	JBE  sequenceDecs_decode_amd64_adjust_offsetB_1_or_0  // at most one extra bit: handled as a repeat-offset reference
   220  	MOVQ R12, R13  // otherwise push the new offset onto the three-entry history
   221  	MOVQ R11, R12
   222  	MOVQ CX, R11
   223  	JMP  sequenceDecs_decode_amd64_after_adjust
   224  
   225  sequenceDecs_decode_amd64_adjust_offsetB_1_or_0:
   226  	CMPQ (R10), $0x00000000
   227  	JNE  sequenceDecs_decode_amd64_adjust_offset_maybezero
   228  	INCQ CX  // literal length is zero: the repeat index is shifted by one
   229  	JMP  sequenceDecs_decode_amd64_adjust_offset_nonzero
   230  
   231  sequenceDecs_decode_amd64_adjust_offset_maybezero:
   232  	TESTQ CX, CX
   233  	JNZ   sequenceDecs_decode_amd64_adjust_offset_nonzero
   234  	MOVQ  R11, CX  // index 0: reuse the most recent offset, history unchanged
   235  	JMP   sequenceDecs_decode_amd64_after_adjust
   236  
   237  sequenceDecs_decode_amd64_adjust_offset_nonzero:
   238  	CMPQ CX, $0x01  // dispatch on the index; both jumps below use this one compare
   239  	JB   sequenceDecs_decode_amd64_adjust_zero
   240  	JEQ  sequenceDecs_decode_amd64_adjust_one
   241  	CMPQ CX, $0x02
   242  	JA   sequenceDecs_decode_amd64_adjust_three
   243  	JMP  sequenceDecs_decode_amd64_adjust_two
   244  
   245  sequenceDecs_decode_amd64_adjust_zero:
   246  	MOVQ R11, AX
   247  	JMP  sequenceDecs_decode_amd64_adjust_test_temp_valid
   248  
   249  sequenceDecs_decode_amd64_adjust_one:
   250  	MOVQ R12, AX
   251  	JMP  sequenceDecs_decode_amd64_adjust_test_temp_valid
   252  
   253  sequenceDecs_decode_amd64_adjust_two:
   254  	MOVQ R13, AX
   255  	JMP  sequenceDecs_decode_amd64_adjust_test_temp_valid
   256  
   257  sequenceDecs_decode_amd64_adjust_three:
   258  	LEAQ -1(R11), AX  // index above 2: most recent offset minus one
   259  
   260  sequenceDecs_decode_amd64_adjust_test_temp_valid:
   261  	TESTQ AX, AX
   262  	JNZ   sequenceDecs_decode_amd64_adjust_temp_valid
   263  	MOVQ  $0x00000001, AX  // a zero candidate is replaced by 1
   264  
   265  sequenceDecs_decode_amd64_adjust_temp_valid:
   266  	CMPQ    CX, $0x01
   267  	CMOVQNE R12, R13  // index 1 only swaps the first two entries; otherwise the third is overwritten
   268  	MOVQ    R11, R12
   269  	MOVQ    AX, R11  // chosen offset becomes the most recent one
   270  	MOVQ    AX, CX
   271  
   272  sequenceDecs_decode_amd64_after_adjust:
   273  	MOVQ CX, 16(R10)  // overwrite the stored offset with the resolved one
   274  
   275  	// Check values
   276  	MOVQ  8(R10), AX  // AX = match length
   277  	MOVQ  (R10), R14  // R14 = literal length
   278  	LEAQ  (AX)(R14*1), R15
   279  	MOVQ  s+0(FP), BP
   280  	ADDQ  R15, 256(BP)  // running total at 256(s) grows by match length + literal length
   281  	MOVQ  ctx+16(FP), R15
   282  	SUBQ  R14, 128(R15)  // consume literals from the budget at 128(ctx)
   283  	JS    error_not_enough_literals  // budget went negative
   284  	CMPQ  AX, $0x00020002
   285  	JA    sequenceDecs_decode_amd64_error_match_len_too_big  // match length above 0x20002 (131074)
   286  	TESTQ CX, CX
   287  	JNZ   sequenceDecs_decode_amd64_match_len_ofs_ok
   288  	TESTQ AX, AX
   289  	JNZ   sequenceDecs_decode_amd64_error_match_len_ofs_mismatch  // zero offset with non-zero match length
   290  
   291  sequenceDecs_decode_amd64_match_len_ofs_ok:
   292  	ADDQ $0x18, R10  // advance the output cursor by one 24-byte record
   293  	MOVQ ctx+16(FP), AX
   294  	DECQ 96(AX)
   295  	JNS  sequenceDecs_decode_amd64_main_loop  // continue while the counter is still non-negative
   296  	MOVQ s+0(FP), AX
   297  	MOVQ R11, 144(AX)  // write the offset history back to s
   298  	MOVQ R12, 152(AX)
   299  	MOVQ R13, 160(AX)
   300  	MOVQ br+8(FP), AX
   301  	MOVQ DX, 24(AX)  // write bit container, consumed-bit count and remaining bytes back to br
   302  	MOVB BL, 32(AX)
   303  	MOVQ SI, 8(AX)
   304  
   305  	// Return success
   306  	MOVQ $0x00000000, ret+24(FP)
   307  	RET
   308  
   309  	// Return with match length error
   310  sequenceDecs_decode_amd64_error_match_len_ofs_mismatch:
   311  	MOVQ $0x00000001, ret+24(FP)
   312  	RET
   313  
   314  	// Return with match too long error
   315  sequenceDecs_decode_amd64_error_match_len_too_big:
   316  	MOVQ $0x00000002, ret+24(FP)
   317  	RET
   318  
   319  	// Return with match offset too long error
        // NOTE: these two instructions follow a RET and carry no label, so nothing
        // in this function can reach them.
   320  	MOVQ $0x00000003, ret+24(FP)
   321  	RET
   322  
   323  	// Return with not enough literals error
   324  error_not_enough_literals:
   325  	MOVQ $0x00000004, ret+24(FP)
   326  	RET
   327  
   328  	// Return with overread error
   329  error_overread:
   330  	MOVQ $0x00000006, ret+24(FP)
   331  	RET
   332  
   333  // func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
   334  // Requires: CMOV
        //
        // Variant of the sequence decoder that refills the bit container only once
        // per sequence: offset, match length and literal length are all read after
        // a single fill, with no second refill before the literal length.
        // Presumably selected when the three reads cannot need more than 56 bits -
        // TODO confirm against the Go caller.
        //
        // Register roles inside the loop:
        //   DX          bit container
        //   BX          number of bits already consumed from DX
        //   SI          8(br): input bytes not yet loaded (presumably len(br.in) - TODO confirm)
        //   (SP)        current read pointer (starts at base+SI and moves towards base)
        //   DI, R8, R9  literal-length, match-length and offset state words
        //   R10         output cursor (24-byte records: +0 literal length, +8 match length, +16 offset)
        //   R11-R13     three most recent offsets, loaded from 144/152/160(s)
        //
        // State word layout as used by the code below:
        //   bits 0..7   bit count for the next-state read
        //   bits 8..15  number of extra bits added to the baseline
        //   bits 16..31 base index of the next state
        //   bits 32..63 value baseline
        //
        // Return value: 0 success, 1 non-zero match length with zero offset,
        // 2 match length too big, 4 not enough literals, 6 bit reader overread.
        // On the error returns the cached offsets and bit-reader registers are
        // not stored back to s or br.
   335  	TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32
   336  	MOVQ    br+8(FP), CX
   337  	MOVQ    24(CX), DX  // DX = bit container
   338  	MOVBQZX 32(CX), BX  // BX = bits consumed so far (single byte, zero-extended)
   339  	MOVQ    (CX), AX  // AX = input base pointer
   340  	MOVQ    8(CX), SI  // SI = remaining input byte count
   341  	ADDQ    SI, AX  // AX = base + remaining = current read position
   342  	MOVQ    AX, (SP)  // the 8-byte frame holds the read pointer between iterations
   343  	MOVQ    ctx+16(FP), AX
   344  	MOVQ    72(AX), DI  // literal-length state word
   345  	MOVQ    80(AX), R8  // match-length state word
   346  	MOVQ    88(AX), R9  // offset state word
   347  	MOVQ    104(AX), R10  // output cursor
   348  	MOVQ    s+0(FP), AX
   349  	MOVQ    144(AX), R11  // most recent offset
   350  	MOVQ    152(AX), R12  // second most recent offset
   351  	MOVQ    160(AX), R13  // third most recent offset
   352  
   353  sequenceDecs_decode_56_amd64_main_loop:
   354  	MOVQ (SP), R14  // R14 = read pointer for this iteration
   355  
   356  	// Fill bitreader to have enough for the offset and match length.
   357  	CMPQ SI, $0x08
   358  	JL   sequenceDecs_decode_56_amd64_fill_byte_by_byte  // fewer than 8 bytes left (signed compare): slow path
   359  	MOVQ BX, AX
   360  	SHRQ $0x03, AX  // AX = whole bytes already consumed
   361  	SUBQ AX, R14  // step the pointer back by that many bytes
   362  	MOVQ (R14), DX  // reload the full 8-byte container from the new position
   363  	SUBQ AX, SI
   364  	ANDQ $0x07, BX  // only the sub-byte bit position remains consumed
   365  	JMP  sequenceDecs_decode_56_amd64_fill_end
   366  
   367  sequenceDecs_decode_56_amd64_fill_byte_by_byte:
   368  	CMPQ    SI, $0x00
   369  	JLE     sequenceDecs_decode_56_amd64_fill_check_overread  // input exhausted: only validate BX
   370  	CMPQ    BX, $0x07
   371  	JLE     sequenceDecs_decode_56_amd64_fill_end  // less than one whole byte consumed: nothing to refill
   372  	SHLQ    $0x08, DX  // make room for one byte at the low end
   373  	SUBQ    $0x01, R14
   374  	SUBQ    $0x01, SI
   375  	SUBQ    $0x08, BX
   376  	MOVBQZX (R14), AX
   377  	ORQ     AX, DX
   378  	JMP     sequenceDecs_decode_56_amd64_fill_byte_by_byte
   379  
   380  sequenceDecs_decode_56_amd64_fill_check_overread:
   381  	CMPQ BX, $0x40
   382  	JA   error_overread  // more than 64 bits consumed with no input left
   383  
   384  sequenceDecs_decode_56_amd64_fill_end:
   385  	// Update offset
   386  	MOVQ  R9, AX
   387  	MOVQ  BX, CX
   388  	MOVQ  DX, R15
   389  	SHLQ  CL, R15  // drop the bits that were already consumed
   390  	MOVB  AH, CL  // CL = extra-bit count (bits 8..15 of the state word)
   391  	SHRQ  $0x20, AX  // AX = baseline (upper 32 bits of the state word)
   392  	TESTQ CX, CX
   393  	JZ    sequenceDecs_decode_56_amd64_of_update_zero  // no extra bits: value is the baseline
   394  	ADDQ  CX, BX  // account for the bits about to be read
   395  	CMPQ  BX, $0x40
   396  	JA    sequenceDecs_decode_56_amd64_of_update_zero  // would read past the container: add nothing
   397  	CMPQ  CX, $0x40
   398  	JAE   sequenceDecs_decode_56_amd64_of_update_zero
   399  	NEGQ  CX  // shift counts are taken mod 64, so this shifts right by 64-n
   400  	SHRQ  CL, R15  // R15 = top n unread bits, right-aligned
   401  	ADDQ  R15, AX
   402  
   403  sequenceDecs_decode_56_amd64_of_update_zero:
   404  	MOVQ AX, 16(R10)  // store offset value
   405  
   406  	// Update match length
   407  	MOVQ  R8, AX  // same read sequence as for the offset, using the match-length state
   408  	MOVQ  BX, CX
   409  	MOVQ  DX, R15
   410  	SHLQ  CL, R15
   411  	MOVB  AH, CL
   412  	SHRQ  $0x20, AX
   413  	TESTQ CX, CX
   414  	JZ    sequenceDecs_decode_56_amd64_ml_update_zero
   415  	ADDQ  CX, BX
   416  	CMPQ  BX, $0x40
   417  	JA    sequenceDecs_decode_56_amd64_ml_update_zero
   418  	CMPQ  CX, $0x40
   419  	JAE   sequenceDecs_decode_56_amd64_ml_update_zero
   420  	NEGQ  CX
   421  	SHRQ  CL, R15
   422  	ADDQ  R15, AX
   423  
   424  sequenceDecs_decode_56_amd64_ml_update_zero:
   425  	MOVQ AX, 8(R10)  // store match length
   426  
   427  	// Update literal length
   428  	MOVQ  DI, AX  // read continues from the same container; there is no refill here
   429  	MOVQ  BX, CX
   430  	MOVQ  DX, R15
   431  	SHLQ  CL, R15
   432  	MOVB  AH, CL
   433  	SHRQ  $0x20, AX
   434  	TESTQ CX, CX
   435  	JZ    sequenceDecs_decode_56_amd64_ll_update_zero
   436  	ADDQ  CX, BX
   437  	CMPQ  BX, $0x40
   438  	JA    sequenceDecs_decode_56_amd64_ll_update_zero
   439  	CMPQ  CX, $0x40
   440  	JAE   sequenceDecs_decode_56_amd64_ll_update_zero
   441  	NEGQ  CX
   442  	SHRQ  CL, R15
   443  	ADDQ  R15, AX
   444  
   445  sequenceDecs_decode_56_amd64_ll_update_zero:
   446  	MOVQ AX, (R10)  // store literal length
   447  
   448  	// Fill bitreader for state updates
   449  	MOVQ    R14, (SP)  // save the read pointer; R14 is reused as scratch below
   450  	MOVQ    R9, AX
   451  	SHRQ    $0x08, AX
   452  	MOVBQZX AL, AX  // AX = offset extra-bit count, captured before R9 is replaced
   453  	MOVQ    ctx+16(FP), CX
   454  	CMPQ    96(CX), $0x00
   455  	JZ      sequenceDecs_decode_56_amd64_skip_update  // counter is zero: last sequence, states are not advanced
   456  
   457  	// Update Literal Length State
   458  	MOVBQZX DI, R14  // R14 = bit count for the next state (low byte)
   459  	SHRL    $0x10, DI  // DI = next-state base (32-bit shift clears the upper half)
   460  	LEAQ    (BX)(R14*1), CX
   461  	MOVQ    DX, R15
   462  	MOVQ    CX, BX  // BX advances by the bits being read
   463  	ROLQ    CL, R15  // rotate so the bits just read sit in the low end
   464  	MOVL    $0x00000001, BP
   465  	MOVB    R14, CL
   466  	SHLL    CL, BP
   467  	DECL    BP  // BP = (1 << bit count) - 1
   468  	ANDQ    BP, R15
   469  	ADDQ    R15, DI  // DI = index of the next state
   470  
   471  	// Load ctx.llTable
   472  	MOVQ ctx+16(FP), CX
   473  	MOVQ (CX), CX
   474  	MOVQ (CX)(DI*8), DI  // table entries are 8 bytes each
   475  
   476  	// Update Match Length State
   477  	MOVBQZX R8, R14
   478  	SHRL    $0x10, R8
   479  	LEAQ    (BX)(R14*1), CX
   480  	MOVQ    DX, R15
   481  	MOVQ    CX, BX
   482  	ROLQ    CL, R15
   483  	MOVL    $0x00000001, BP
   484  	MOVB    R14, CL
   485  	SHLL    CL, BP
   486  	DECL    BP
   487  	ANDQ    BP, R15
   488  	ADDQ    R15, R8
   489  
   490  	// Load ctx.mlTable
   491  	MOVQ ctx+16(FP), CX
   492  	MOVQ 24(CX), CX
   493  	MOVQ (CX)(R8*8), R8
   494  
   495  	// Update Offset State
   496  	MOVBQZX R9, R14
   497  	SHRL    $0x10, R9
   498  	LEAQ    (BX)(R14*1), CX
   499  	MOVQ    DX, R15
   500  	MOVQ    CX, BX
   501  	ROLQ    CL, R15
   502  	MOVL    $0x00000001, BP
   503  	MOVB    R14, CL
   504  	SHLL    CL, BP
   505  	DECL    BP
   506  	ANDQ    BP, R15
   507  	ADDQ    R15, R9
   508  
   509  	// Load ctx.ofTable
   510  	MOVQ ctx+16(FP), CX
   511  	MOVQ 48(CX), CX
   512  	MOVQ (CX)(R9*8), R9
   513  
   514  sequenceDecs_decode_56_amd64_skip_update:
   515  	// Adjust offset
   516  	MOVQ 16(R10), CX  // CX = offset value decoded above
   517  	CMPQ AX, $0x01
   518  	JBE  sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0  // at most one extra bit: handled as a repeat-offset reference
   519  	MOVQ R12, R13  // otherwise push the new offset onto the three-entry history
   520  	MOVQ R11, R12
   521  	MOVQ CX, R11
   522  	JMP  sequenceDecs_decode_56_amd64_after_adjust
   523  
   524  sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0:
   525  	CMPQ (R10), $0x00000000
   526  	JNE  sequenceDecs_decode_56_amd64_adjust_offset_maybezero
   527  	INCQ CX  // literal length is zero: the repeat index is shifted by one
   528  	JMP  sequenceDecs_decode_56_amd64_adjust_offset_nonzero
   529  
   530  sequenceDecs_decode_56_amd64_adjust_offset_maybezero:
   531  	TESTQ CX, CX
   532  	JNZ   sequenceDecs_decode_56_amd64_adjust_offset_nonzero
   533  	MOVQ  R11, CX  // index 0: reuse the most recent offset, history unchanged
   534  	JMP   sequenceDecs_decode_56_amd64_after_adjust
   535  
   536  sequenceDecs_decode_56_amd64_adjust_offset_nonzero:
   537  	CMPQ CX, $0x01  // dispatch on the index; both jumps below use this one compare
   538  	JB   sequenceDecs_decode_56_amd64_adjust_zero
   539  	JEQ  sequenceDecs_decode_56_amd64_adjust_one
   540  	CMPQ CX, $0x02
   541  	JA   sequenceDecs_decode_56_amd64_adjust_three
   542  	JMP  sequenceDecs_decode_56_amd64_adjust_two
   543  
   544  sequenceDecs_decode_56_amd64_adjust_zero:
   545  	MOVQ R11, AX
   546  	JMP  sequenceDecs_decode_56_amd64_adjust_test_temp_valid
   547  
   548  sequenceDecs_decode_56_amd64_adjust_one:
   549  	MOVQ R12, AX
   550  	JMP  sequenceDecs_decode_56_amd64_adjust_test_temp_valid
   551  
   552  sequenceDecs_decode_56_amd64_adjust_two:
   553  	MOVQ R13, AX
   554  	JMP  sequenceDecs_decode_56_amd64_adjust_test_temp_valid
   555  
   556  sequenceDecs_decode_56_amd64_adjust_three:
   557  	LEAQ -1(R11), AX  // index above 2: most recent offset minus one
   558  
   559  sequenceDecs_decode_56_amd64_adjust_test_temp_valid:
   560  	TESTQ AX, AX
   561  	JNZ   sequenceDecs_decode_56_amd64_adjust_temp_valid
   562  	MOVQ  $0x00000001, AX  // a zero candidate is replaced by 1
   563  
   564  sequenceDecs_decode_56_amd64_adjust_temp_valid:
   565  	CMPQ    CX, $0x01
   566  	CMOVQNE R12, R13  // index 1 only swaps the first two entries; otherwise the third is overwritten
   567  	MOVQ    R11, R12
   568  	MOVQ    AX, R11  // chosen offset becomes the most recent one
   569  	MOVQ    AX, CX
   570  
   571  sequenceDecs_decode_56_amd64_after_adjust:
   572  	MOVQ CX, 16(R10)  // overwrite the stored offset with the resolved one
   573  
   574  	// Check values
   575  	MOVQ  8(R10), AX  // AX = match length
   576  	MOVQ  (R10), R14  // R14 = literal length
   577  	LEAQ  (AX)(R14*1), R15
   578  	MOVQ  s+0(FP), BP
   579  	ADDQ  R15, 256(BP)  // running total at 256(s) grows by match length + literal length
   580  	MOVQ  ctx+16(FP), R15
   581  	SUBQ  R14, 128(R15)  // consume literals from the budget at 128(ctx)
   582  	JS    error_not_enough_literals  // budget went negative
   583  	CMPQ  AX, $0x00020002
   584  	JA    sequenceDecs_decode_56_amd64_error_match_len_too_big  // match length above 0x20002 (131074)
   585  	TESTQ CX, CX
   586  	JNZ   sequenceDecs_decode_56_amd64_match_len_ofs_ok
   587  	TESTQ AX, AX
   588  	JNZ   sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch  // zero offset with non-zero match length
   589  
   590  sequenceDecs_decode_56_amd64_match_len_ofs_ok:
   591  	ADDQ $0x18, R10  // advance the output cursor by one 24-byte record
   592  	MOVQ ctx+16(FP), AX
   593  	DECQ 96(AX)
   594  	JNS  sequenceDecs_decode_56_amd64_main_loop  // continue while the counter is still non-negative
   595  	MOVQ s+0(FP), AX
   596  	MOVQ R11, 144(AX)  // write the offset history back to s
   597  	MOVQ R12, 152(AX)
   598  	MOVQ R13, 160(AX)
   599  	MOVQ br+8(FP), AX
   600  	MOVQ DX, 24(AX)  // write bit container, consumed-bit count and remaining bytes back to br
   601  	MOVB BL, 32(AX)
   602  	MOVQ SI, 8(AX)
   603  
   604  	// Return success
   605  	MOVQ $0x00000000, ret+24(FP)
   606  	RET
   607  
   608  	// Return with match length error
   609  sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch:
   610  	MOVQ $0x00000001, ret+24(FP)
   611  	RET
   612  
   613  	// Return with match too long error
   614  sequenceDecs_decode_56_amd64_error_match_len_too_big:
   615  	MOVQ $0x00000002, ret+24(FP)
   616  	RET
   617  
   618  	// Return with match offset too long error
        // NOTE: these two instructions follow a RET and carry no label, so nothing
        // in this function can reach them.
   619  	MOVQ $0x00000003, ret+24(FP)
   620  	RET
   621  
   622  	// Return with not enough literals error
   623  error_not_enough_literals:
   624  	MOVQ $0x00000004, ret+24(FP)
   625  	RET
   626  
   627  	// Return with overread error
   628  error_overread:
   629  	MOVQ $0x00000006, ret+24(FP)
   630  	RET
   631  
   632  // func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
   633  // Requires: BMI, BMI2, CMOV
        //
        // BMI/BMI2 variant of the sequence decoder: bit fields are extracted with
        // BEXTR/BZHI/SHRX instead of shift-and-mask sequences. Each loop iteration
        // writes three 8-byte values to the output cursor loaded from 104(ctx):
        // literal length at +0, match length at +8 and offset at +16; the cursor
        // then advances by 24 bytes. The counter at 96(ctx) is decremented once per
        // sequence and ends the loop when it becomes negative.
        //
        // Register roles inside the loop:
        //   AX          bit container
        //   DX          number of bits already consumed from AX
        //   BX          8(br): input bytes not yet loaded (presumably len(br.in) - TODO confirm)
        //   (SP)        current read pointer (starts at base+BX and moves towards base)
        //   SI, DI, R8  literal-length, match-length and offset state words
        //   R9          output cursor
        //   R10-R12     three most recent offsets, loaded from 144/152/160(s)
        //   CX, R13-R15 scratch
        //
        // State word layout as used by the code below:
        //   bits 0..7   bit count for the next-state read
        //   bits 8..15  number of extra bits added to the baseline
        //   bits 16..31 base index of the next state
        //   bits 32..63 value baseline
        //
        // Return value: 0 success, 1 non-zero match length with zero offset,
        // 2 match length too big, 4 not enough literals, 6 bit reader overread.
        // On the error returns the cached offsets and bit-reader registers are
        // not stored back to s or br.
   634  	TEXT ·sequenceDecs_decode_bmi2(SB), $8-32
   635  	MOVQ    br+8(FP), BX
   636  	MOVQ    24(BX), AX  // AX = bit container
   637  	MOVBQZX 32(BX), DX  // DX = bits consumed so far (single byte, zero-extended)
   638  	MOVQ    (BX), CX  // CX = input base pointer
   639  	MOVQ    8(BX), BX  // BX = remaining input byte count (br pointer is no longer needed)
   640  	ADDQ    BX, CX  // CX = base + remaining = current read position
   641  	MOVQ    CX, (SP)  // the 8-byte frame holds the read pointer between iterations
   642  	MOVQ    ctx+16(FP), CX
   643  	MOVQ    72(CX), SI  // literal-length state word
   644  	MOVQ    80(CX), DI  // match-length state word
   645  	MOVQ    88(CX), R8  // offset state word
   646  	MOVQ    104(CX), R9  // output cursor
   647  	MOVQ    s+0(FP), CX
   648  	MOVQ    144(CX), R10  // most recent offset
   649  	MOVQ    152(CX), R11  // second most recent offset
   650  	MOVQ    160(CX), R12  // third most recent offset
   651  
   652  sequenceDecs_decode_bmi2_main_loop:
   653  	MOVQ (SP), R13  // R13 = read pointer for this iteration
   654  
   655  	// Fill bitreader to have enough for the offset and match length.
   656  	CMPQ BX, $0x08
   657  	JL   sequenceDecs_decode_bmi2_fill_byte_by_byte  // fewer than 8 bytes left (signed compare): slow path
   658  	MOVQ DX, CX
   659  	SHRQ $0x03, CX  // CX = whole bytes already consumed
   660  	SUBQ CX, R13  // step the pointer back by that many bytes
   661  	MOVQ (R13), AX  // reload the full 8-byte container from the new position
   662  	SUBQ CX, BX
   663  	ANDQ $0x07, DX  // only the sub-byte bit position remains consumed
   664  	JMP  sequenceDecs_decode_bmi2_fill_end
   665  
   666  sequenceDecs_decode_bmi2_fill_byte_by_byte:
   667  	CMPQ    BX, $0x00
   668  	JLE     sequenceDecs_decode_bmi2_fill_check_overread  // input exhausted: only validate DX
   669  	CMPQ    DX, $0x07
   670  	JLE     sequenceDecs_decode_bmi2_fill_end  // less than one whole byte consumed: nothing to refill
   671  	SHLQ    $0x08, AX  // make room for one byte at the low end
   672  	SUBQ    $0x01, R13
   673  	SUBQ    $0x01, BX
   674  	SUBQ    $0x08, DX
   675  	MOVBQZX (R13), CX
   676  	ORQ     CX, AX
   677  	JMP     sequenceDecs_decode_bmi2_fill_byte_by_byte
   678  
   679  sequenceDecs_decode_bmi2_fill_check_overread:
   680  	CMPQ DX, $0x40
   681  	JA   error_overread  // more than 64 bits consumed with no input left
   682  
   683  sequenceDecs_decode_bmi2_fill_end:
   684  	// Update offset
   685  	MOVQ   $0x00000808, CX  // BEXTR control: start bit 8, length 8
   686  	BEXTRQ CX, R8, R14  // R14 = extra-bit count (bits 8..15 of the state word)
   687  	MOVQ   AX, R15
   688  	LEAQ   (DX)(R14*1), CX  // CX = consumed-bit count after this read
   689  	ROLQ   CL, R15  // rotate so the bits being read land in the low end
   690  	BZHIQ  R14, R15, R15  // keep only the low R14 bits
   691  	MOVQ   CX, DX
   692  	MOVQ   R8, CX
   693  	SHRQ   $0x20, CX  // CX = baseline (upper 32 bits of the state word)
   694  	ADDQ   R15, CX
   695  	MOVQ   CX, 16(R9)  // store offset value
   696  
   697  	// Update match length
   698  	MOVQ   $0x00000808, CX  // same read sequence, using the match-length state
   699  	BEXTRQ CX, DI, R14
   700  	MOVQ   AX, R15
   701  	LEAQ   (DX)(R14*1), CX
   702  	ROLQ   CL, R15
   703  	BZHIQ  R14, R15, R15
   704  	MOVQ   CX, DX
   705  	MOVQ   DI, CX
   706  	SHRQ   $0x20, CX
   707  	ADDQ   R15, CX
   708  	MOVQ   CX, 8(R9)  // store match length
   709  
   710  	// Fill bitreader to have enough for the remaining
   711  	CMPQ BX, $0x08
   712  	JL   sequenceDecs_decode_bmi2_fill_2_byte_by_byte
   713  	MOVQ DX, CX  // same refill logic as the first fill above
   714  	SHRQ $0x03, CX
   715  	SUBQ CX, R13
   716  	MOVQ (R13), AX
   717  	SUBQ CX, BX
   718  	ANDQ $0x07, DX
   719  	JMP  sequenceDecs_decode_bmi2_fill_2_end
   720  
   721  sequenceDecs_decode_bmi2_fill_2_byte_by_byte:
   722  	CMPQ    BX, $0x00
   723  	JLE     sequenceDecs_decode_bmi2_fill_2_check_overread
   724  	CMPQ    DX, $0x07
   725  	JLE     sequenceDecs_decode_bmi2_fill_2_end
   726  	SHLQ    $0x08, AX
   727  	SUBQ    $0x01, R13
   728  	SUBQ    $0x01, BX
   729  	SUBQ    $0x08, DX
   730  	MOVBQZX (R13), CX
   731  	ORQ     CX, AX
   732  	JMP     sequenceDecs_decode_bmi2_fill_2_byte_by_byte
   733  
   734  sequenceDecs_decode_bmi2_fill_2_check_overread:
   735  	CMPQ DX, $0x40
   736  	JA   error_overread
   737  
   738  sequenceDecs_decode_bmi2_fill_2_end:
   739  	// Update literal length
   740  	MOVQ   $0x00000808, CX  // same read sequence, using the literal-length state
   741  	BEXTRQ CX, SI, R14
   742  	MOVQ   AX, R15
   743  	LEAQ   (DX)(R14*1), CX
   744  	ROLQ   CL, R15
   745  	BZHIQ  R14, R15, R15
   746  	MOVQ   CX, DX
   747  	MOVQ   SI, CX
   748  	SHRQ   $0x20, CX
   749  	ADDQ   R15, CX
   750  	MOVQ   CX, (R9)  // store literal length
   751  
   752  	// Fill bitreader for state updates
   753  	MOVQ    R13, (SP)  // save the read pointer; R13 is reused below
   754  	MOVQ    $0x00000808, CX
   755  	BEXTRQ  CX, R8, R13  // R13 = offset extra-bit count, captured before R8 is replaced
   756  	MOVQ    ctx+16(FP), CX
   757  	CMPQ    96(CX), $0x00
   758  	JZ      sequenceDecs_decode_bmi2_skip_update  // counter is zero: last sequence, states are not advanced
   759  	LEAQ    (SI)(DI*1), R14
   760  	ADDQ    R8, R14
   761  	MOVBQZX R14, R14  // R14 = low byte of the sum of the three state words (their bit counts added, mod 256)
   762  	LEAQ    (DX)(R14*1), CX
   763  	MOVQ    AX, R15
   764  	MOVQ    CX, DX  // DX advances by all three next-state reads at once
   765  	ROLQ    CL, R15
   766  	BZHIQ   R14, R15, R15  // R15 = the combined next-state bits, right-aligned
   767  
   768  	// Update Offset State
   769  	BZHIQ R8, R15, CX  // index taken from the low byte of R8: CX = low bit-count bits of R15
   770  	SHRXQ R8, R15, R15  // discard those bits; the remaining fields move down
   771  	SHRL  $0x10, R8  // R8 = next-state base (32-bit shift clears the upper half)
   772  	ADDQ  CX, R8  // R8 = index of the next state
   773  
   774  	// Load ctx.ofTable
   775  	MOVQ ctx+16(FP), CX
   776  	MOVQ 48(CX), CX
   777  	MOVQ (CX)(R8*8), R8  // table entries are 8 bytes each
   778  
   779  	// Update Match Length State
   780  	BZHIQ DI, R15, CX
   781  	SHRXQ DI, R15, R15
   782  	SHRL  $0x10, DI
   783  	ADDQ  CX, DI
   784  
   785  	// Load ctx.mlTable
   786  	MOVQ ctx+16(FP), CX
   787  	MOVQ 24(CX), CX
   788  	MOVQ (CX)(DI*8), DI
   789  
   790  	// Update Literal Length State
   791  	BZHIQ SI, R15, CX  // last field: no shift of R15 is needed afterwards
   792  	SHRL  $0x10, SI
   793  	ADDQ  CX, SI
   794  
   795  	// Load ctx.llTable
   796  	MOVQ ctx+16(FP), CX
   797  	MOVQ (CX), CX
   798  	MOVQ (CX)(SI*8), SI
   799  
   800  sequenceDecs_decode_bmi2_skip_update:
   801  	// Adjust offset
   802  	MOVQ 16(R9), CX  // CX = offset value decoded above
   803  	CMPQ R13, $0x01
   804  	JBE  sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0  // at most one extra bit: handled as a repeat-offset reference
   805  	MOVQ R11, R12  // otherwise push the new offset onto the three-entry history
   806  	MOVQ R10, R11
   807  	MOVQ CX, R10
   808  	JMP  sequenceDecs_decode_bmi2_after_adjust
   809  
   810  sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0:
   811  	CMPQ (R9), $0x00000000
   812  	JNE  sequenceDecs_decode_bmi2_adjust_offset_maybezero
   813  	INCQ CX  // literal length is zero: the repeat index is shifted by one
   814  	JMP  sequenceDecs_decode_bmi2_adjust_offset_nonzero
   815  
   816  sequenceDecs_decode_bmi2_adjust_offset_maybezero:
   817  	TESTQ CX, CX
   818  	JNZ   sequenceDecs_decode_bmi2_adjust_offset_nonzero
   819  	MOVQ  R10, CX  // index 0: reuse the most recent offset, history unchanged
   820  	JMP   sequenceDecs_decode_bmi2_after_adjust
   821  
   822  sequenceDecs_decode_bmi2_adjust_offset_nonzero:
   823  	CMPQ CX, $0x01  // dispatch on the index; both jumps below use this one compare
   824  	JB   sequenceDecs_decode_bmi2_adjust_zero
   825  	JEQ  sequenceDecs_decode_bmi2_adjust_one
   826  	CMPQ CX, $0x02
   827  	JA   sequenceDecs_decode_bmi2_adjust_three
   828  	JMP  sequenceDecs_decode_bmi2_adjust_two
   829  
   830  sequenceDecs_decode_bmi2_adjust_zero:
   831  	MOVQ R10, R13
   832  	JMP  sequenceDecs_decode_bmi2_adjust_test_temp_valid
   833  
   834  sequenceDecs_decode_bmi2_adjust_one:
   835  	MOVQ R11, R13
   836  	JMP  sequenceDecs_decode_bmi2_adjust_test_temp_valid
   837  
   838  sequenceDecs_decode_bmi2_adjust_two:
   839  	MOVQ R12, R13
   840  	JMP  sequenceDecs_decode_bmi2_adjust_test_temp_valid
   841  
   842  sequenceDecs_decode_bmi2_adjust_three:
   843  	LEAQ -1(R10), R13  // index above 2: most recent offset minus one
   844  
   845  sequenceDecs_decode_bmi2_adjust_test_temp_valid:
   846  	TESTQ R13, R13
   847  	JNZ   sequenceDecs_decode_bmi2_adjust_temp_valid
   848  	MOVQ  $0x00000001, R13  // a zero candidate is replaced by 1
   849  
   850  sequenceDecs_decode_bmi2_adjust_temp_valid:
   851  	CMPQ    CX, $0x01
   852  	CMOVQNE R11, R12  // index 1 only swaps the first two entries; otherwise the third is overwritten
   853  	MOVQ    R10, R11
   854  	MOVQ    R13, R10  // chosen offset becomes the most recent one
   855  	MOVQ    R13, CX
   856  
   857  sequenceDecs_decode_bmi2_after_adjust:
   858  	MOVQ CX, 16(R9)  // overwrite the stored offset with the resolved one
   859  
   860  	// Check values
   861  	MOVQ  8(R9), R13  // R13 = match length
   862  	MOVQ  (R9), R14  // R14 = literal length
   863  	LEAQ  (R13)(R14*1), R15
   864  	MOVQ  s+0(FP), BP
   865  	ADDQ  R15, 256(BP)  // running total at 256(s) grows by match length + literal length
   866  	MOVQ  ctx+16(FP), R15
   867  	SUBQ  R14, 128(R15)  // consume literals from the budget at 128(ctx)
   868  	JS    error_not_enough_literals  // budget went negative
   869  	CMPQ  R13, $0x00020002
   870  	JA    sequenceDecs_decode_bmi2_error_match_len_too_big  // match length above 0x20002 (131074)
   871  	TESTQ CX, CX
   872  	JNZ   sequenceDecs_decode_bmi2_match_len_ofs_ok
   873  	TESTQ R13, R13
   874  	JNZ   sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch  // zero offset with non-zero match length
   875  
   876  sequenceDecs_decode_bmi2_match_len_ofs_ok:
   877  	ADDQ $0x18, R9  // advance the output cursor by one 24-byte record
   878  	MOVQ ctx+16(FP), CX
   879  	DECQ 96(CX)
   880  	JNS  sequenceDecs_decode_bmi2_main_loop  // continue while the counter is still non-negative
   881  	MOVQ s+0(FP), CX
   882  	MOVQ R10, 144(CX)  // write the offset history back to s
   883  	MOVQ R11, 152(CX)
   884  	MOVQ R12, 160(CX)
   885  	MOVQ br+8(FP), CX
   886  	MOVQ AX, 24(CX)  // write bit container, consumed-bit count and remaining bytes back to br
   887  	MOVB DL, 32(CX)
   888  	MOVQ BX, 8(CX)
   889  
   890  	// Return success
   891  	MOVQ $0x00000000, ret+24(FP)
   892  	RET
   893  
   894  	// Return with match length error
   895  sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch:
   896  	MOVQ $0x00000001, ret+24(FP)
   897  	RET
   898  
   899  	// Return with match too long error
   900  sequenceDecs_decode_bmi2_error_match_len_too_big:
   901  	MOVQ $0x00000002, ret+24(FP)
   902  	RET
   903  
   904  	// Return with match offset too long error
        // NOTE: these two instructions follow a RET and carry no label, so nothing
        // in this function can reach them.
   905  	MOVQ $0x00000003, ret+24(FP)
   906  	RET
   907  
   908  	// Return with not enough literals error
   909  error_not_enough_literals:
   910  	MOVQ $0x00000004, ret+24(FP)
   911  	RET
   912  
   913  	// Return with overread error
   914  error_overread:
   915  	MOVQ $0x00000006, ret+24(FP)
   916  	RET
   917  
   918  // func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
   919  // Requires: BMI, BMI2, CMOV
   920  TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32
   921  	MOVQ    br+8(FP), BX
   922  	MOVQ    24(BX), AX
   923  	MOVBQZX 32(BX), DX
   924  	MOVQ    (BX), CX
   925  	MOVQ    8(BX), BX
   926  	ADDQ    BX, CX
   927  	MOVQ    CX, (SP)
   928  	MOVQ    ctx+16(FP), CX
   929  	MOVQ    72(CX), SI
   930  	MOVQ    80(CX), DI
   931  	MOVQ    88(CX), R8
   932  	MOVQ    104(CX), R9
   933  	MOVQ    s+0(FP), CX
   934  	MOVQ    144(CX), R10
   935  	MOVQ    152(CX), R11
   936  	MOVQ    160(CX), R12
   937  
   938  sequenceDecs_decode_56_bmi2_main_loop:
   939  	MOVQ (SP), R13
   940  
   941  	// Fill bitreader to have enough for the offset and match length.
   942  	CMPQ BX, $0x08
   943  	JL   sequenceDecs_decode_56_bmi2_fill_byte_by_byte
   944  	MOVQ DX, CX
   945  	SHRQ $0x03, CX
   946  	SUBQ CX, R13
   947  	MOVQ (R13), AX
   948  	SUBQ CX, BX
   949  	ANDQ $0x07, DX
   950  	JMP  sequenceDecs_decode_56_bmi2_fill_end
   951  
   952  sequenceDecs_decode_56_bmi2_fill_byte_by_byte:
   953  	CMPQ    BX, $0x00
   954  	JLE     sequenceDecs_decode_56_bmi2_fill_check_overread
   955  	CMPQ    DX, $0x07
   956  	JLE     sequenceDecs_decode_56_bmi2_fill_end
   957  	SHLQ    $0x08, AX
   958  	SUBQ    $0x01, R13
   959  	SUBQ    $0x01, BX
   960  	SUBQ    $0x08, DX
   961  	MOVBQZX (R13), CX
   962  	ORQ     CX, AX
   963  	JMP     sequenceDecs_decode_56_bmi2_fill_byte_by_byte
   964  
   965  sequenceDecs_decode_56_bmi2_fill_check_overread:
   966  	CMPQ DX, $0x40
   967  	JA   error_overread
   968  
   969  sequenceDecs_decode_56_bmi2_fill_end:
   970  	// Update offset
   971  	MOVQ   $0x00000808, CX
   972  	BEXTRQ CX, R8, R14
   973  	MOVQ   AX, R15
   974  	LEAQ   (DX)(R14*1), CX
   975  	ROLQ   CL, R15
   976  	BZHIQ  R14, R15, R15
   977  	MOVQ   CX, DX
   978  	MOVQ   R8, CX
   979  	SHRQ   $0x20, CX
   980  	ADDQ   R15, CX
   981  	MOVQ   CX, 16(R9)
   982  
   983  	// Update match length
   984  	MOVQ   $0x00000808, CX
   985  	BEXTRQ CX, DI, R14
   986  	MOVQ   AX, R15
   987  	LEAQ   (DX)(R14*1), CX
   988  	ROLQ   CL, R15
   989  	BZHIQ  R14, R15, R15
   990  	MOVQ   CX, DX
   991  	MOVQ   DI, CX
   992  	SHRQ   $0x20, CX
   993  	ADDQ   R15, CX
   994  	MOVQ   CX, 8(R9)
   995  
   996  	// Update literal length
   997  	MOVQ   $0x00000808, CX
   998  	BEXTRQ CX, SI, R14
   999  	MOVQ   AX, R15
  1000  	LEAQ   (DX)(R14*1), CX
  1001  	ROLQ   CL, R15
  1002  	BZHIQ  R14, R15, R15
  1003  	MOVQ   CX, DX
  1004  	MOVQ   SI, CX
  1005  	SHRQ   $0x20, CX
  1006  	ADDQ   R15, CX
  1007  	MOVQ   CX, (R9)
  1008  
  1009  	// Fill bitreader for state updates
  1010  	MOVQ    R13, (SP)
  1011  	MOVQ    $0x00000808, CX
  1012  	BEXTRQ  CX, R8, R13
  1013  	MOVQ    ctx+16(FP), CX
  1014  	CMPQ    96(CX), $0x00
  1015  	JZ      sequenceDecs_decode_56_bmi2_skip_update
  1016  	LEAQ    (SI)(DI*1), R14
  1017  	ADDQ    R8, R14
  1018  	MOVBQZX R14, R14
  1019  	LEAQ    (DX)(R14*1), CX
  1020  	MOVQ    AX, R15
  1021  	MOVQ    CX, DX
  1022  	ROLQ    CL, R15
  1023  	BZHIQ   R14, R15, R15
  1024  
  1025  	// Update Offset State
  1026  	BZHIQ R8, R15, CX
  1027  	SHRXQ R8, R15, R15
  1028  	SHRL  $0x10, R8
  1029  	ADDQ  CX, R8
  1030  
  1031  	// Load ctx.ofTable
  1032  	MOVQ ctx+16(FP), CX
  1033  	MOVQ 48(CX), CX
  1034  	MOVQ (CX)(R8*8), R8
  1035  
  1036  	// Update Match Length State
  1037  	BZHIQ DI, R15, CX
  1038  	SHRXQ DI, R15, R15
  1039  	SHRL  $0x10, DI
  1040  	ADDQ  CX, DI
  1041  
  1042  	// Load ctx.mlTable
  1043  	MOVQ ctx+16(FP), CX
  1044  	MOVQ 24(CX), CX
  1045  	MOVQ (CX)(DI*8), DI
  1046  
  1047  	// Update Literal Length State
  1048  	BZHIQ SI, R15, CX
  1049  	SHRL  $0x10, SI
  1050  	ADDQ  CX, SI
  1051  
  1052  	// Load ctx.llTable
  1053  	MOVQ ctx+16(FP), CX
  1054  	MOVQ (CX), CX
  1055  	MOVQ (CX)(SI*8), SI
  1056  
  1057  sequenceDecs_decode_56_bmi2_skip_update:
  1058  	// Adjust offset
  1059  	MOVQ 16(R9), CX
  1060  	CMPQ R13, $0x01
  1061  	JBE  sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0
  1062  	MOVQ R11, R12
  1063  	MOVQ R10, R11
  1064  	MOVQ CX, R10
  1065  	JMP  sequenceDecs_decode_56_bmi2_after_adjust
  1066  
  1067  sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0:
  1068  	CMPQ (R9), $0x00000000
  1069  	JNE  sequenceDecs_decode_56_bmi2_adjust_offset_maybezero
  1070  	INCQ CX
  1071  	JMP  sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
  1072  
  1073  sequenceDecs_decode_56_bmi2_adjust_offset_maybezero:
  1074  	TESTQ CX, CX
  1075  	JNZ   sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
  1076  	MOVQ  R10, CX
  1077  	JMP   sequenceDecs_decode_56_bmi2_after_adjust
  1078  
  1079  sequenceDecs_decode_56_bmi2_adjust_offset_nonzero:
  1080  	CMPQ CX, $0x01
  1081  	JB   sequenceDecs_decode_56_bmi2_adjust_zero
  1082  	JEQ  sequenceDecs_decode_56_bmi2_adjust_one
  1083  	CMPQ CX, $0x02
  1084  	JA   sequenceDecs_decode_56_bmi2_adjust_three
  1085  	JMP  sequenceDecs_decode_56_bmi2_adjust_two
  1086  
  1087  sequenceDecs_decode_56_bmi2_adjust_zero:
  1088  	MOVQ R10, R13
  1089  	JMP  sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
  1090  
  1091  sequenceDecs_decode_56_bmi2_adjust_one:
  1092  	MOVQ R11, R13
  1093  	JMP  sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
  1094  
  1095  sequenceDecs_decode_56_bmi2_adjust_two:
  1096  	MOVQ R12, R13
  1097  	JMP  sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
  1098  
  1099  sequenceDecs_decode_56_bmi2_adjust_three:
  1100  	LEAQ -1(R10), R13
  1101  
  1102  sequenceDecs_decode_56_bmi2_adjust_test_temp_valid:
  1103  	TESTQ R13, R13
  1104  	JNZ   sequenceDecs_decode_56_bmi2_adjust_temp_valid
  1105  	MOVQ  $0x00000001, R13
  1106  
  1107  sequenceDecs_decode_56_bmi2_adjust_temp_valid:
  1108  	CMPQ    CX, $0x01
  1109  	CMOVQNE R11, R12
  1110  	MOVQ    R10, R11
  1111  	MOVQ    R13, R10
  1112  	MOVQ    R13, CX
  1113  
  1114  sequenceDecs_decode_56_bmi2_after_adjust:
  1115  	MOVQ CX, 16(R9)
  1116  
  1117  	// Check values
  1118  	MOVQ  8(R9), R13
  1119  	MOVQ  (R9), R14
  1120  	LEAQ  (R13)(R14*1), R15
  1121  	MOVQ  s+0(FP), BP
  1122  	ADDQ  R15, 256(BP)
  1123  	MOVQ  ctx+16(FP), R15
  1124  	SUBQ  R14, 128(R15)
  1125  	JS    error_not_enough_literals
  1126  	CMPQ  R13, $0x00020002
  1127  	JA    sequenceDecs_decode_56_bmi2_error_match_len_too_big
  1128  	TESTQ CX, CX
  1129  	JNZ   sequenceDecs_decode_56_bmi2_match_len_ofs_ok
  1130  	TESTQ R13, R13
  1131  	JNZ   sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch
  1132  
  1133  sequenceDecs_decode_56_bmi2_match_len_ofs_ok:
  1134  	ADDQ $0x18, R9
  1135  	MOVQ ctx+16(FP), CX
  1136  	DECQ 96(CX)
  1137  	JNS  sequenceDecs_decode_56_bmi2_main_loop
  1138  	MOVQ s+0(FP), CX
  1139  	MOVQ R10, 144(CX)
  1140  	MOVQ R11, 152(CX)
  1141  	MOVQ R12, 160(CX)
  1142  	MOVQ br+8(FP), CX
  1143  	MOVQ AX, 24(CX)
  1144  	MOVB DL, 32(CX)
  1145  	MOVQ BX, 8(CX)
  1146  
  1147  	// Return success
  1148  	MOVQ $0x00000000, ret+24(FP)
  1149  	RET
  1150  
  1151  	// Return with match length error
  1152  sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch:
  1153  	MOVQ $0x00000001, ret+24(FP)
  1154  	RET
  1155  
  1156  	// Return with match too long error
  1157  sequenceDecs_decode_56_bmi2_error_match_len_too_big:
  1158  	MOVQ $0x00000002, ret+24(FP)
  1159  	RET
  1160  
  1161  	// Return with match offset too long error
  1162  	MOVQ $0x00000003, ret+24(FP)
  1163  	RET
  1164  
  1165  	// Return with not enough literals error
  1166  error_not_enough_literals:
  1167  	MOVQ $0x00000004, ret+24(FP)
  1168  	RET
  1169  
  1170  	// Return with overread error
  1171  error_overread:
  1172  	MOVQ $0x00000006, ret+24(FP)
  1173  	RET
  1174  
  1175  // func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
  1176  // Requires: SSE
        //
        // Executes already-decoded sequences. For every 24-byte sequence record it
        // copies the literal run to the output and then copies the match from the
        // history buffer, from the output written so far, or from both.
        //
        // Context offsets read on entry (field names are not visible in this file;
        // the roles below are taken from how each value is used):
        //     0(ctx) sequence array base      8(ctx) sequence count (loop bound)
        //    24(ctx) current sequence index  32(ctx) output base
        //    56(ctx) history base            64(ctx) history length
        //    80(ctx) literals base          104(ctx) output position
        //   120(ctx) window size
        // Context offsets written on exit (success and error, not for empty input):
        //    24(ctx) sequence index reached 104(ctx) output position reached
        //   112(ctx) literal bytes consumed (literal cursor minus 80(ctx))
        //
        // Register roles inside the loop:
        //   AX  current sequence record     CX  sequence count
        //   DX  sequence index              BX  output write cursor
        //   SI  literal read cursor         DI  output position
        //   R8  window size                 R9  end of history (base + length)
        //   R10 history length              R11 literal length, then scratch
        //   R12 match offset                R13 match length
        //   R14, R15, BP, X0 are scratch.
        //
        // Returns true when all sequences were executed (or there were none) and
        // false when a match offset is larger than the data available or the window.
        //
        // NOTE(review): the literal copy (copy_1) and the non-overlapping match copy
        // (copy_2) always move whole 16-byte blocks, so they can read and write up to
        // 15 bytes past the requested length; presumably the caller guarantees that
        // slack - confirm.
        // NOTE(review): BP is used as scratch below; presumably the non-zero frame
        // size ($8) is there so the assembler preserves BP - confirm.
  1177  TEXT ·sequenceDecs_executeSimple_amd64(SB), $8-9
  1178  	MOVQ  ctx+0(FP), R10
  1179  	MOVQ  8(R10), CX // CX = sequence count
  1180  	TESTQ CX, CX
  1181  	JZ    empty_seqs // no sequences: report success, leave ctx untouched
  1182  	MOVQ  (R10), AX // AX = sequence array base
  1183  	MOVQ  24(R10), DX // DX = index of the first sequence to execute
  1184  	MOVQ  32(R10), BX // BX = output base
  1185  	MOVQ  80(R10), SI // SI = literal read cursor
  1186  	MOVQ  104(R10), DI // DI = output position
  1187  	MOVQ  120(R10), R8 // R8 = window size
  1188  	MOVQ  56(R10), R9 // R9 = history base
  1189  	MOVQ  64(R10), R10 // R10 = history length (ctx is reloaded from FP on exit)
  1190  	ADDQ  R10, R9 // R9 = one past the last history byte
  1191  
  1192  	// seqsBase += 24 * seqIndex (R11 = 3*index, then shifted left by 3)
  1193  	LEAQ (DX)(DX*2), R11
  1194  	SHLQ $0x03, R11
  1195  	ADDQ R11, AX
  1196  
  1197  	// outBase += outPosition
  1198  	ADDQ DI, BX
  1199  
        // Load one sequence record: literal length, match offset, match length.
  1200  main_loop:
  1201  	MOVQ (AX), R11 // R11 = literal length
  1202  	MOVQ 16(AX), R12 // R12 = match offset
  1203  	MOVQ 8(AX), R13 // R13 = match length
  1204  
  1205  	// Copy literals
  1206  	TESTQ R11, R11
  1207  	JZ    check_offset // no literals in this sequence
  1208  	XORQ  R14, R14 // R14 = bytes copied so far
  1209  
        // Copy 16 bytes per iteration until at least R11 bytes have been moved.
  1210  copy_1:
  1211  	MOVUPS (SI)(R14*1), X0
  1212  	MOVUPS X0, (BX)(R14*1)
  1213  	ADDQ   $0x10, R14
  1214  	CMPQ   R14, R11
  1215  	JB     copy_1
  1216  	ADDQ   R11, SI // advance cursors by the exact literal length
  1217  	ADDQ   R11, BX
  1218  	ADDQ   R11, DI
  1219  
  1220  	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize (signed compares)
  1221  check_offset:
  1222  	LEAQ (DI)(R10*1), R11 // R11 = output position + history length
  1223  	CMPQ R12, R11
  1224  	JG   error_match_off_too_big
  1225  	CMPQ R12, R8
  1226  	JG   error_match_off_too_big
  1227  
  1228  	// Copy match from history
  1229  	MOVQ R12, R11
  1230  	SUBQ DI, R11 // R11 = match offset - output position = bytes that lie in history
  1231  	JLS  copy_match // offset <= output position: match is entirely in the output
  1232  	MOVQ R9, R14
  1233  	SUBQ R11, R14 // R14 = source pointer inside history
  1234  	CMPQ R13, R11
  1235  	JG   copy_all_from_history // match is longer than what history provides
  1236  	MOVQ R13, R11
  1237  	SUBQ $0x10, R11 // R11 = match length - 16; borrow means fewer than 16 bytes
  1238  	JB   copy_4_small
  1239  
        // Whole match comes from history, length >= 16: 16-byte blocks, then one
        // final 16-byte move that ends exactly on the last byte of the match.
  1240  copy_4_loop:
  1241  	MOVUPS (R14), X0
  1242  	MOVUPS X0, (BX)
  1243  	ADDQ   $0x10, R14
  1244  	ADDQ   $0x10, BX
  1245  	SUBQ   $0x10, R11
  1246  	JAE    copy_4_loop
  1247  	LEAQ   16(R14)(R11*1), R14 // R11 is negative here: place both cursors at the end
  1248  	LEAQ   16(BX)(R11*1), BX
  1249  	MOVUPS -16(R14), X0
  1250  	MOVUPS X0, -16(BX)
  1251  	JMP    copy_4_end
  1252  
        // Whole match comes from history, length < 16. Only the 3, 4-7 and 8-15 byte
        // cases exist. NOTE(review): presumably match lengths below 3 never reach
        // this point - confirm.
  1253  copy_4_small:
  1254  	CMPQ R13, $0x03
  1255  	JE   copy_4_move_3
  1256  	CMPQ R13, $0x08
  1257  	JB   copy_4_move_4through7
  1258  	JMP  copy_4_move_8through16
  1259  
        // 3 bytes: one word plus one byte. R11/R12 are scratch from here on; this
        // path ends the sequence, so the match offset in R12 is no longer needed.
  1260  copy_4_move_3:
  1261  	MOVW (R14), R11
  1262  	MOVB 2(R14), R12
  1263  	MOVW R11, (BX)
  1264  	MOVB R12, 2(BX)
  1265  	ADDQ R13, R14
  1266  	ADDQ R13, BX
  1267  	JMP  copy_4_end
  1268  
        // 4-7 bytes: two 4-byte moves, the second ending on the last byte.
  1269  copy_4_move_4through7:
  1270  	MOVL (R14), R11
  1271  	MOVL -4(R14)(R13*1), R12
  1272  	MOVL R11, (BX)
  1273  	MOVL R12, -4(BX)(R13*1)
  1274  	ADDQ R13, R14
  1275  	ADDQ R13, BX
  1276  	JMP  copy_4_end
  1277  
        // 8 bytes and up (below 16 on this path): two 8-byte moves, the second
        // ending on the last byte.
  1278  copy_4_move_8through16:
  1279  	MOVQ (R14), R11
  1280  	MOVQ -8(R14)(R13*1), R12
  1281  	MOVQ R11, (BX)
  1282  	MOVQ R12, -8(BX)(R13*1)
  1283  	ADDQ R13, R14
  1284  	ADDQ R13, BX
  1285  
        // Match fully served from history: account for it and step to the next
        // sequence record.
  1286  copy_4_end:
  1287  	ADDQ R13, DI
  1288  	ADDQ $0x18, AX
  1289  	INCQ DX
  1290  	CMPQ DX, CX
  1291  	JB   main_loop
  1292  	JMP  loop_finished
  1293  
        // The match starts in history and continues in the output: first copy the
        // R11 bytes that history provides, then fall into copy_match for the rest.
  1294  copy_all_from_history:
  1295  	MOVQ R11, R15
  1296  	SUBQ $0x10, R15 // R15 = history bytes - 16; borrow means fewer than 16 bytes
  1297  	JB   copy_5_small
  1298  
        // 16-byte blocks followed by a final 16-byte move ending on the last byte.
  1299  copy_5_loop:
  1300  	MOVUPS (R14), X0
  1301  	MOVUPS X0, (BX)
  1302  	ADDQ   $0x10, R14
  1303  	ADDQ   $0x10, BX
  1304  	SUBQ   $0x10, R15
  1305  	JAE    copy_5_loop
  1306  	LEAQ   16(R14)(R15*1), R14
  1307  	LEAQ   16(BX)(R15*1), BX
  1308  	MOVUPS -16(R14), X0
  1309  	MOVUPS X0, -16(BX)
  1310  	JMP    copy_5_end
  1311  
        // Fewer than 16 history bytes: dispatch on the exact count in R11.
  1312  copy_5_small:
  1313  	CMPQ R11, $0x03
  1314  	JE   copy_5_move_3
  1315  	JB   copy_5_move_1or2
  1316  	CMPQ R11, $0x08
  1317  	JB   copy_5_move_4through7
  1318  	JMP  copy_5_move_8through16
  1319  
        // 1 or 2 bytes: first byte and last byte (the same byte when the count is 1).
  1320  copy_5_move_1or2:
  1321  	MOVB (R14), R15
  1322  	MOVB -1(R14)(R11*1), BP
  1323  	MOVB R15, (BX)
  1324  	MOVB BP, -1(BX)(R11*1)
  1325  	ADDQ R11, R14
  1326  	ADDQ R11, BX
  1327  	JMP  copy_5_end
  1328  
        // 3 bytes: one word plus one byte.
  1329  copy_5_move_3:
  1330  	MOVW (R14), R15
  1331  	MOVB 2(R14), BP
  1332  	MOVW R15, (BX)
  1333  	MOVB BP, 2(BX)
  1334  	ADDQ R11, R14
  1335  	ADDQ R11, BX
  1336  	JMP  copy_5_end
  1337  
        // 4-7 bytes: two 4-byte moves, the second ending on the last byte.
  1338  copy_5_move_4through7:
  1339  	MOVL (R14), R15
  1340  	MOVL -4(R14)(R11*1), BP
  1341  	MOVL R15, (BX)
  1342  	MOVL BP, -4(BX)(R11*1)
  1343  	ADDQ R11, R14
  1344  	ADDQ R11, BX
  1345  	JMP  copy_5_end
  1346  
        // 8 bytes and up (below 16 on this path): two 8-byte moves, the second
        // ending on the last byte.
  1347  copy_5_move_8through16:
  1348  	MOVQ (R14), R15
  1349  	MOVQ -8(R14)(R11*1), BP
  1350  	MOVQ R15, (BX)
  1351  	MOVQ BP, -8(BX)(R11*1)
  1352  	ADDQ R11, R14
  1353  	ADDQ R11, BX
  1354  
        // History part done: R13 becomes the match length still to be copied.
  1355  copy_5_end:
  1356  	ADDQ R11, DI
  1357  	SUBQ R11, R13
  1358  
  1359  	// Copy match from the current buffer
  1360  copy_match:
  1361  	MOVQ BX, R11
  1362  	SUBQ R12, R11 // R11 = source = output cursor - match offset
  1363  
  1364  	// ml <= mo
  1365  	CMPQ R13, R12
  1366  	JA   copy_overlapping_match // ml > mo: source and destination overlap
  1367  
  1368  	// Copy non-overlapping match
  1369  	ADDQ R13, DI
  1370  	MOVQ BX, R12 // R12 = destination cursor for the block loop
  1371  	ADDQ R13, BX // output cursor moves by the exact match length
  1372  
        // 16 bytes per iteration; loops while bytes remain after the subtraction.
  1373  copy_2:
  1374  	MOVUPS (R11), X0
  1375  	MOVUPS X0, (R12)
  1376  	ADDQ   $0x10, R11
  1377  	ADDQ   $0x10, R12
  1378  	SUBQ   $0x10, R13
  1379  	JHI    copy_2
  1380  	JMP    handle_loop
  1381  
  1382  	// Copy overlapping match
  1383  copy_overlapping_match:
  1384  	ADDQ R13, DI
  1385  
        // Byte-by-byte so that bytes written earlier in this loop can be read back
        // as source. R13 >= 1 here because the branch above required ml > mo.
  1386  copy_slow_3:
  1387  	MOVB (R11), R12
  1388  	MOVB R12, (BX)
  1389  	INCQ R11
  1390  	INCQ BX
  1391  	DECQ R13
  1392  	JNZ  copy_slow_3
  1393  
        // Step to the next 24-byte sequence record.
  1394  handle_loop:
  1395  	ADDQ $0x18, AX
  1396  	INCQ DX
  1397  	CMPQ DX, CX
  1398  	JB   main_loop
  1399  
  1400  loop_finished:
  1401  	// Return value
  1402  	MOVB $0x01, ret+8(FP)
  1403  
  1404  	// Update the context
  1405  	MOVQ ctx+0(FP), AX
  1406  	MOVQ DX, 24(AX) // sequence index reached
  1407  	MOVQ DI, 104(AX) // output position reached
  1408  	SUBQ 80(AX), SI // literal cursor - literals base = literal bytes consumed
  1409  	MOVQ SI, 112(AX)
  1410  	RET
  1411  
  1412  error_match_off_too_big:
  1413  	// Return value
  1414  	MOVB $0x00, ret+8(FP)
  1415  
  1416  	// Update the context (same fields as on success, so the caller can see where it stopped)
  1417  	MOVQ ctx+0(FP), AX
  1418  	MOVQ DX, 24(AX)
  1419  	MOVQ DI, 104(AX)
  1420  	SUBQ 80(AX), SI
  1421  	MOVQ SI, 112(AX)
  1422  	RET
  1423  
  1424  empty_seqs:
  1425  	// Return value
  1426  	MOVB $0x01, ret+8(FP)
  1427  	RET
  1428  
  1429  // func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool
  1430  // Requires: SSE
        //
        // Executes already-decoded sequences, like sequenceDecs_executeSimple_amd64,
        // but the literal copy and the non-overlapping match copy move exactly the
        // requested number of bytes: lengths below 16 use dedicated 1-2, 3, 4-7 and
        // 8+ byte paths, and longer runs finish with a 16-byte move that ends on the
        // last byte instead of running past it.
        //
        // Context offsets read on entry (field names are not visible in this file;
        // the roles below are taken from how each value is used):
        //     0(ctx) sequence array base      8(ctx) sequence count (loop bound)
        //    24(ctx) current sequence index  32(ctx) output base
        //    56(ctx) history base            64(ctx) history length
        //    80(ctx) literals base          104(ctx) output position
        //   120(ctx) window size
        // Context offsets written on exit (success and error, not for empty input):
        //    24(ctx) sequence index reached 104(ctx) output position reached
        //   112(ctx) literal bytes consumed (literal cursor minus 80(ctx))
        //
        // Register roles inside the loop:
        //   AX  current sequence record     CX  sequence count
        //   DX  sequence index              BX  output write cursor
        //   SI  literal read cursor         DI  output position
        //   R8  window size                 R9  end of history (base + length)
        //   R10 history length              R11 literal length, then scratch
        //   R12 match offset                R13 match length
        //   R14, R15, BP, X0 are scratch.
        //
        // Returns true when all sequences were executed (or there were none) and
        // false when a match offset is larger than the data available or the window.
        //
        // NOTE(review): BP is used as scratch below; presumably the non-zero frame
        // size ($8) is there so the assembler preserves BP - confirm.
  1431  TEXT ·sequenceDecs_executeSimple_safe_amd64(SB), $8-9
  1432  	MOVQ  ctx+0(FP), R10
  1433  	MOVQ  8(R10), CX // CX = sequence count
  1434  	TESTQ CX, CX
  1435  	JZ    empty_seqs // no sequences: report success, leave ctx untouched
  1436  	MOVQ  (R10), AX // AX = sequence array base
  1437  	MOVQ  24(R10), DX // DX = index of the first sequence to execute
  1438  	MOVQ  32(R10), BX // BX = output base
  1439  	MOVQ  80(R10), SI // SI = literal read cursor
  1440  	MOVQ  104(R10), DI // DI = output position
  1441  	MOVQ  120(R10), R8 // R8 = window size
  1442  	MOVQ  56(R10), R9 // R9 = history base
  1443  	MOVQ  64(R10), R10 // R10 = history length (ctx is reloaded from FP on exit)
  1444  	ADDQ  R10, R9 // R9 = one past the last history byte
  1445  
  1446  	// seqsBase += 24 * seqIndex (R11 = 3*index, then shifted left by 3)
  1447  	LEAQ (DX)(DX*2), R11
  1448  	SHLQ $0x03, R11
  1449  	ADDQ R11, AX
  1450  
  1451  	// outBase += outPosition
  1452  	ADDQ DI, BX
  1453  
        // Load one sequence record: literal length, match offset, match length.
  1454  main_loop:
  1455  	MOVQ (AX), R11 // R11 = literal length
  1456  	MOVQ 16(AX), R12 // R12 = match offset
  1457  	MOVQ 8(AX), R13 // R13 = match length
  1458  
  1459  	// Copy literals
  1460  	TESTQ R11, R11
  1461  	JZ    check_offset // no literals in this sequence
  1462  	MOVQ  R11, R14
  1463  	SUBQ  $0x10, R14 // R14 = literal length - 16; borrow means fewer than 16 bytes
  1464  	JB    copy_1_small
  1465  
        // Literal run of 16 bytes or more: 16-byte blocks, then one final 16-byte
        // move that ends exactly on the last literal byte.
  1466  copy_1_loop:
  1467  	MOVUPS (SI), X0
  1468  	MOVUPS X0, (BX)
  1469  	ADDQ   $0x10, SI
  1470  	ADDQ   $0x10, BX
  1471  	SUBQ   $0x10, R14
  1472  	JAE    copy_1_loop
  1473  	LEAQ   16(SI)(R14*1), SI // R14 is negative here: place both cursors at the end
  1474  	LEAQ   16(BX)(R14*1), BX
  1475  	MOVUPS -16(SI), X0
  1476  	MOVUPS X0, -16(BX)
  1477  	JMP    copy_1_end
  1478  
        // Fewer than 16 literal bytes: dispatch on the exact count in R11.
  1479  copy_1_small:
  1480  	CMPQ R11, $0x03
  1481  	JE   copy_1_move_3
  1482  	JB   copy_1_move_1or2
  1483  	CMPQ R11, $0x08
  1484  	JB   copy_1_move_4through7
  1485  	JMP  copy_1_move_8through16
  1486  
        // 1 or 2 bytes: first byte and last byte (the same byte when the count is 1).
  1487  copy_1_move_1or2:
  1488  	MOVB (SI), R14
  1489  	MOVB -1(SI)(R11*1), R15
  1490  	MOVB R14, (BX)
  1491  	MOVB R15, -1(BX)(R11*1)
  1492  	ADDQ R11, SI
  1493  	ADDQ R11, BX
  1494  	JMP  copy_1_end
  1495  
        // 3 bytes: one word plus one byte.
  1496  copy_1_move_3:
  1497  	MOVW (SI), R14
  1498  	MOVB 2(SI), R15
  1499  	MOVW R14, (BX)
  1500  	MOVB R15, 2(BX)
  1501  	ADDQ R11, SI
  1502  	ADDQ R11, BX
  1503  	JMP  copy_1_end
  1504  
        // 4-7 bytes: two 4-byte moves, the second ending on the last byte.
  1505  copy_1_move_4through7:
  1506  	MOVL (SI), R14
  1507  	MOVL -4(SI)(R11*1), R15
  1508  	MOVL R14, (BX)
  1509  	MOVL R15, -4(BX)(R11*1)
  1510  	ADDQ R11, SI
  1511  	ADDQ R11, BX
  1512  	JMP  copy_1_end
  1513  
        // 8 bytes and up (below 16 on this path): two 8-byte moves, the second
        // ending on the last byte.
  1514  copy_1_move_8through16:
  1515  	MOVQ (SI), R14
  1516  	MOVQ -8(SI)(R11*1), R15
  1517  	MOVQ R14, (BX)
  1518  	MOVQ R15, -8(BX)(R11*1)
  1519  	ADDQ R11, SI
  1520  	ADDQ R11, BX
  1521  
        // Literals done: SI and BX were already advanced by the copy paths above.
  1522  copy_1_end:
  1523  	ADDQ R11, DI
  1524  
  1525  	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize (signed compares)
  1526  check_offset:
  1527  	LEAQ (DI)(R10*1), R11 // R11 = output position + history length
  1528  	CMPQ R12, R11
  1529  	JG   error_match_off_too_big
  1530  	CMPQ R12, R8
  1531  	JG   error_match_off_too_big
  1532  
  1533  	// Copy match from history
  1534  	MOVQ R12, R11
  1535  	SUBQ DI, R11 // R11 = match offset - output position = bytes that lie in history
  1536  	JLS  copy_match // offset <= output position: match is entirely in the output
  1537  	MOVQ R9, R14
  1538  	SUBQ R11, R14 // R14 = source pointer inside history
  1539  	CMPQ R13, R11
  1540  	JG   copy_all_from_history // match is longer than what history provides
  1541  	MOVQ R13, R11
  1542  	SUBQ $0x10, R11 // R11 = match length - 16; borrow means fewer than 16 bytes
  1543  	JB   copy_4_small
  1544  
        // Whole match comes from history, length >= 16: 16-byte blocks, then one
        // final 16-byte move that ends exactly on the last byte of the match.
  1545  copy_4_loop:
  1546  	MOVUPS (R14), X0
  1547  	MOVUPS X0, (BX)
  1548  	ADDQ   $0x10, R14
  1549  	ADDQ   $0x10, BX
  1550  	SUBQ   $0x10, R11
  1551  	JAE    copy_4_loop
  1552  	LEAQ   16(R14)(R11*1), R14
  1553  	LEAQ   16(BX)(R11*1), BX
  1554  	MOVUPS -16(R14), X0
  1555  	MOVUPS X0, -16(BX)
  1556  	JMP    copy_4_end
  1557  
        // Whole match comes from history, length < 16. Only the 3, 4-7 and 8-15 byte
        // cases exist. NOTE(review): presumably match lengths below 3 never reach
        // this point - confirm.
  1558  copy_4_small:
  1559  	CMPQ R13, $0x03
  1560  	JE   copy_4_move_3
  1561  	CMPQ R13, $0x08
  1562  	JB   copy_4_move_4through7
  1563  	JMP  copy_4_move_8through16
  1564  
        // 3 bytes: one word plus one byte. R11/R12 are scratch from here on; this
        // path ends the sequence, so the match offset in R12 is no longer needed.
  1565  copy_4_move_3:
  1566  	MOVW (R14), R11
  1567  	MOVB 2(R14), R12
  1568  	MOVW R11, (BX)
  1569  	MOVB R12, 2(BX)
  1570  	ADDQ R13, R14
  1571  	ADDQ R13, BX
  1572  	JMP  copy_4_end
  1573  
        // 4-7 bytes: two 4-byte moves, the second ending on the last byte.
  1574  copy_4_move_4through7:
  1575  	MOVL (R14), R11
  1576  	MOVL -4(R14)(R13*1), R12
  1577  	MOVL R11, (BX)
  1578  	MOVL R12, -4(BX)(R13*1)
  1579  	ADDQ R13, R14
  1580  	ADDQ R13, BX
  1581  	JMP  copy_4_end
  1582  
        // 8 bytes and up (below 16 on this path): two 8-byte moves, the second
        // ending on the last byte.
  1583  copy_4_move_8through16:
  1584  	MOVQ (R14), R11
  1585  	MOVQ -8(R14)(R13*1), R12
  1586  	MOVQ R11, (BX)
  1587  	MOVQ R12, -8(BX)(R13*1)
  1588  	ADDQ R13, R14
  1589  	ADDQ R13, BX
  1590  
        // Match fully served from history: account for it and step to the next
        // sequence record.
  1591  copy_4_end:
  1592  	ADDQ R13, DI
  1593  	ADDQ $0x18, AX
  1594  	INCQ DX
  1595  	CMPQ DX, CX
  1596  	JB   main_loop
  1597  	JMP  loop_finished
  1598  
        // The match starts in history and continues in the output: first copy the
        // R11 bytes that history provides, then fall into copy_match for the rest.
  1599  copy_all_from_history:
  1600  	MOVQ R11, R15
  1601  	SUBQ $0x10, R15 // R15 = history bytes - 16; borrow means fewer than 16 bytes
  1602  	JB   copy_5_small
  1603  
        // 16-byte blocks followed by a final 16-byte move ending on the last byte.
  1604  copy_5_loop:
  1605  	MOVUPS (R14), X0
  1606  	MOVUPS X0, (BX)
  1607  	ADDQ   $0x10, R14
  1608  	ADDQ   $0x10, BX
  1609  	SUBQ   $0x10, R15
  1610  	JAE    copy_5_loop
  1611  	LEAQ   16(R14)(R15*1), R14
  1612  	LEAQ   16(BX)(R15*1), BX
  1613  	MOVUPS -16(R14), X0
  1614  	MOVUPS X0, -16(BX)
  1615  	JMP    copy_5_end
  1616  
        // Fewer than 16 history bytes: dispatch on the exact count in R11.
  1617  copy_5_small:
  1618  	CMPQ R11, $0x03
  1619  	JE   copy_5_move_3
  1620  	JB   copy_5_move_1or2
  1621  	CMPQ R11, $0x08
  1622  	JB   copy_5_move_4through7
  1623  	JMP  copy_5_move_8through16
  1624  
        // 1 or 2 bytes: first byte and last byte (the same byte when the count is 1).
  1625  copy_5_move_1or2:
  1626  	MOVB (R14), R15
  1627  	MOVB -1(R14)(R11*1), BP
  1628  	MOVB R15, (BX)
  1629  	MOVB BP, -1(BX)(R11*1)
  1630  	ADDQ R11, R14
  1631  	ADDQ R11, BX
  1632  	JMP  copy_5_end
  1633  
        // 3 bytes: one word plus one byte.
  1634  copy_5_move_3:
  1635  	MOVW (R14), R15
  1636  	MOVB 2(R14), BP
  1637  	MOVW R15, (BX)
  1638  	MOVB BP, 2(BX)
  1639  	ADDQ R11, R14
  1640  	ADDQ R11, BX
  1641  	JMP  copy_5_end
  1642  
        // 4-7 bytes: two 4-byte moves, the second ending on the last byte.
  1643  copy_5_move_4through7:
  1644  	MOVL (R14), R15
  1645  	MOVL -4(R14)(R11*1), BP
  1646  	MOVL R15, (BX)
  1647  	MOVL BP, -4(BX)(R11*1)
  1648  	ADDQ R11, R14
  1649  	ADDQ R11, BX
  1650  	JMP  copy_5_end
  1651  
        // 8 bytes and up (below 16 on this path): two 8-byte moves, the second
        // ending on the last byte.
  1652  copy_5_move_8through16:
  1653  	MOVQ (R14), R15
  1654  	MOVQ -8(R14)(R11*1), BP
  1655  	MOVQ R15, (BX)
  1656  	MOVQ BP, -8(BX)(R11*1)
  1657  	ADDQ R11, R14
  1658  	ADDQ R11, BX
  1659  
        // History part done: R13 becomes the match length still to be copied.
  1660  copy_5_end:
  1661  	ADDQ R11, DI
  1662  	SUBQ R11, R13
  1663  
  1664  	// Copy match from the current buffer
  1665  copy_match:
  1666  	MOVQ BX, R11
  1667  	SUBQ R12, R11 // R11 = source = output cursor - match offset
  1668  
  1669  	// ml <= mo
  1670  	CMPQ R13, R12
  1671  	JA   copy_overlapping_match // ml > mo: source and destination overlap
  1672  
  1673  	// Copy non-overlapping match
  1674  	ADDQ R13, DI
  1675  	MOVQ R13, R12
  1676  	SUBQ $0x10, R12 // R12 = match length - 16; borrow means fewer than 16 bytes
  1677  	JB   copy_2_small
  1678  
        // 16-byte blocks, then one final 16-byte move ending on the last match byte.
  1679  copy_2_loop:
  1680  	MOVUPS (R11), X0
  1681  	MOVUPS X0, (BX)
  1682  	ADDQ   $0x10, R11
  1683  	ADDQ   $0x10, BX
  1684  	SUBQ   $0x10, R12
  1685  	JAE    copy_2_loop
  1686  	LEAQ   16(R11)(R12*1), R11
  1687  	LEAQ   16(BX)(R12*1), BX
  1688  	MOVUPS -16(R11), X0
  1689  	MOVUPS X0, -16(BX)
  1690  	JMP    copy_2_end
  1691  
        // Fewer than 16 match bytes: dispatch on the exact count in R13.
  1692  copy_2_small:
  1693  	CMPQ R13, $0x03
  1694  	JE   copy_2_move_3
  1695  	JB   copy_2_move_1or2
  1696  	CMPQ R13, $0x08
  1697  	JB   copy_2_move_4through7
  1698  	JMP  copy_2_move_8through16
  1699  
        // 1 or 2 bytes: first byte and last byte (the same byte when the count is 1).
  1700  copy_2_move_1or2:
  1701  	MOVB (R11), R12
  1702  	MOVB -1(R11)(R13*1), R14
  1703  	MOVB R12, (BX)
  1704  	MOVB R14, -1(BX)(R13*1)
  1705  	ADDQ R13, R11
  1706  	ADDQ R13, BX
  1707  	JMP  copy_2_end
  1708  
        // 3 bytes: one word plus one byte.
  1709  copy_2_move_3:
  1710  	MOVW (R11), R12
  1711  	MOVB 2(R11), R14
  1712  	MOVW R12, (BX)
  1713  	MOVB R14, 2(BX)
  1714  	ADDQ R13, R11
  1715  	ADDQ R13, BX
  1716  	JMP  copy_2_end
  1717  
        // 4-7 bytes: two 4-byte moves, the second ending on the last byte.
  1718  copy_2_move_4through7:
  1719  	MOVL (R11), R12
  1720  	MOVL -4(R11)(R13*1), R14
  1721  	MOVL R12, (BX)
  1722  	MOVL R14, -4(BX)(R13*1)
  1723  	ADDQ R13, R11
  1724  	ADDQ R13, BX
  1725  	JMP  copy_2_end
  1726  
        // 8 bytes and up (below 16 on this path): two 8-byte moves, the second
        // ending on the last byte.
  1727  copy_2_move_8through16:
  1728  	MOVQ (R11), R12
  1729  	MOVQ -8(R11)(R13*1), R14
  1730  	MOVQ R12, (BX)
  1731  	MOVQ R14, -8(BX)(R13*1)
  1732  	ADDQ R13, R11
  1733  	ADDQ R13, BX
  1734  
  1735  copy_2_end:
  1736  	JMP handle_loop
  1737  
  1738  	// Copy overlapping match
  1739  copy_overlapping_match:
  1740  	ADDQ R13, DI
  1741  
        // Byte-by-byte so that bytes written earlier in this loop can be read back
        // as source. R13 >= 1 here because the branch above required ml > mo.
  1742  copy_slow_3:
  1743  	MOVB (R11), R12
  1744  	MOVB R12, (BX)
  1745  	INCQ R11
  1746  	INCQ BX
  1747  	DECQ R13
  1748  	JNZ  copy_slow_3
  1749  
        // Step to the next 24-byte sequence record.
  1750  handle_loop:
  1751  	ADDQ $0x18, AX
  1752  	INCQ DX
  1753  	CMPQ DX, CX
  1754  	JB   main_loop
  1755  
  1756  loop_finished:
  1757  	// Return value
  1758  	MOVB $0x01, ret+8(FP)
  1759  
  1760  	// Update the context
  1761  	MOVQ ctx+0(FP), AX
  1762  	MOVQ DX, 24(AX) // sequence index reached
  1763  	MOVQ DI, 104(AX) // output position reached
  1764  	SUBQ 80(AX), SI // literal cursor - literals base = literal bytes consumed
  1765  	MOVQ SI, 112(AX)
  1766  	RET
  1767  
  1768  error_match_off_too_big:
  1769  	// Return value
  1770  	MOVB $0x00, ret+8(FP)
  1771  
  1772  	// Update the context (same fields as on success, so the caller can see where it stopped)
  1773  	MOVQ ctx+0(FP), AX
  1774  	MOVQ DX, 24(AX)
  1775  	MOVQ DI, 104(AX)
  1776  	SUBQ 80(AX), SI
  1777  	MOVQ SI, 112(AX)
  1778  	RET
  1779  
  1780  empty_seqs:
  1781  	// Return value
  1782  	MOVB $0x01, ret+8(FP)
  1783  	RET
  1784  
  1785  // func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
  1786  // Requires: CMOV, SSE
  1787  TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32
  1788  	MOVQ    br+8(FP), CX
  1789  	MOVQ    24(CX), DX
  1790  	MOVBQZX 32(CX), BX
  1791  	MOVQ    (CX), AX
  1792  	MOVQ    8(CX), SI
  1793  	ADDQ    SI, AX
  1794  	MOVQ    AX, (SP)
  1795  	MOVQ    ctx+16(FP), AX
  1796  	MOVQ    72(AX), DI
  1797  	MOVQ    80(AX), R8
  1798  	MOVQ    88(AX), R9
  1799  	XORQ    CX, CX
  1800  	MOVQ    CX, 8(SP)
  1801  	MOVQ    CX, 16(SP)
  1802  	MOVQ    CX, 24(SP)
  1803  	MOVQ    112(AX), R10
  1804  	MOVQ    128(AX), CX
  1805  	MOVQ    CX, 32(SP)
  1806  	MOVQ    144(AX), R11
  1807  	MOVQ    136(AX), R12
  1808  	MOVQ    200(AX), CX
  1809  	MOVQ    CX, 56(SP)
  1810  	MOVQ    176(AX), CX
  1811  	MOVQ    CX, 48(SP)
  1812  	MOVQ    184(AX), AX
  1813  	MOVQ    AX, 40(SP)
  1814  	MOVQ    40(SP), AX
  1815  	ADDQ    AX, 48(SP)
  1816  
  1817  	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
  1818  	ADDQ R10, 32(SP)
  1819  
  1820  	// outBase += outPosition
  1821  	ADDQ R12, R10
  1822  
  1823  sequenceDecs_decodeSync_amd64_main_loop:
  1824  	MOVQ (SP), R13
  1825  
  1826  	// Fill bitreader to have enough for the offset and match length.
  1827  	CMPQ SI, $0x08
  1828  	JL   sequenceDecs_decodeSync_amd64_fill_byte_by_byte
  1829  	MOVQ BX, AX
  1830  	SHRQ $0x03, AX
  1831  	SUBQ AX, R13
  1832  	MOVQ (R13), DX
  1833  	SUBQ AX, SI
  1834  	ANDQ $0x07, BX
  1835  	JMP  sequenceDecs_decodeSync_amd64_fill_end
  1836  
  1837  sequenceDecs_decodeSync_amd64_fill_byte_by_byte:
  1838  	CMPQ    SI, $0x00
  1839  	JLE     sequenceDecs_decodeSync_amd64_fill_check_overread
  1840  	CMPQ    BX, $0x07
  1841  	JLE     sequenceDecs_decodeSync_amd64_fill_end
  1842  	SHLQ    $0x08, DX
  1843  	SUBQ    $0x01, R13
  1844  	SUBQ    $0x01, SI
  1845  	SUBQ    $0x08, BX
  1846  	MOVBQZX (R13), AX
  1847  	ORQ     AX, DX
  1848  	JMP     sequenceDecs_decodeSync_amd64_fill_byte_by_byte
  1849  
  1850  sequenceDecs_decodeSync_amd64_fill_check_overread:
  1851  	CMPQ BX, $0x40
  1852  	JA   error_overread
  1853  
  1854  sequenceDecs_decodeSync_amd64_fill_end:
  1855  	// Update offset
  1856  	MOVQ  R9, AX
  1857  	MOVQ  BX, CX
  1858  	MOVQ  DX, R14
  1859  	SHLQ  CL, R14
  1860  	MOVB  AH, CL
  1861  	SHRQ  $0x20, AX
  1862  	TESTQ CX, CX
  1863  	JZ    sequenceDecs_decodeSync_amd64_of_update_zero
  1864  	ADDQ  CX, BX
  1865  	CMPQ  BX, $0x40
  1866  	JA    sequenceDecs_decodeSync_amd64_of_update_zero
  1867  	CMPQ  CX, $0x40
  1868  	JAE   sequenceDecs_decodeSync_amd64_of_update_zero
  1869  	NEGQ  CX
  1870  	SHRQ  CL, R14
  1871  	ADDQ  R14, AX
  1872  
  1873  sequenceDecs_decodeSync_amd64_of_update_zero:
  1874  	MOVQ AX, 8(SP)
  1875  
  1876  	// Update match length
  1877  	MOVQ  R8, AX
  1878  	MOVQ  BX, CX
  1879  	MOVQ  DX, R14
  1880  	SHLQ  CL, R14
  1881  	MOVB  AH, CL
  1882  	SHRQ  $0x20, AX
  1883  	TESTQ CX, CX
  1884  	JZ    sequenceDecs_decodeSync_amd64_ml_update_zero
  1885  	ADDQ  CX, BX
  1886  	CMPQ  BX, $0x40
  1887  	JA    sequenceDecs_decodeSync_amd64_ml_update_zero
  1888  	CMPQ  CX, $0x40
  1889  	JAE   sequenceDecs_decodeSync_amd64_ml_update_zero
  1890  	NEGQ  CX
  1891  	SHRQ  CL, R14
  1892  	ADDQ  R14, AX
  1893  
  1894  sequenceDecs_decodeSync_amd64_ml_update_zero:
  1895  	MOVQ AX, 16(SP)
  1896  
  1897  	// Fill bitreader to have enough for the remaining
  1898  	CMPQ SI, $0x08
  1899  	JL   sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
  1900  	MOVQ BX, AX
  1901  	SHRQ $0x03, AX
  1902  	SUBQ AX, R13
  1903  	MOVQ (R13), DX
  1904  	SUBQ AX, SI
  1905  	ANDQ $0x07, BX
  1906  	JMP  sequenceDecs_decodeSync_amd64_fill_2_end
  1907  
  1908  sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte:
  1909  	CMPQ    SI, $0x00
  1910  	JLE     sequenceDecs_decodeSync_amd64_fill_2_check_overread
  1911  	CMPQ    BX, $0x07
  1912  	JLE     sequenceDecs_decodeSync_amd64_fill_2_end
  1913  	SHLQ    $0x08, DX
  1914  	SUBQ    $0x01, R13
  1915  	SUBQ    $0x01, SI
  1916  	SUBQ    $0x08, BX
  1917  	MOVBQZX (R13), AX
  1918  	ORQ     AX, DX
  1919  	JMP     sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
  1920  
  1921  sequenceDecs_decodeSync_amd64_fill_2_check_overread:
  1922  	CMPQ BX, $0x40
  1923  	JA   error_overread
  1924  
  1925  sequenceDecs_decodeSync_amd64_fill_2_end:
  1926  	// Update literal length
  1927  	MOVQ  DI, AX
  1928  	MOVQ  BX, CX
  1929  	MOVQ  DX, R14
  1930  	SHLQ  CL, R14
  1931  	MOVB  AH, CL
  1932  	SHRQ  $0x20, AX
  1933  	TESTQ CX, CX
  1934  	JZ    sequenceDecs_decodeSync_amd64_ll_update_zero
  1935  	ADDQ  CX, BX
  1936  	CMPQ  BX, $0x40
  1937  	JA    sequenceDecs_decodeSync_amd64_ll_update_zero
  1938  	CMPQ  CX, $0x40
  1939  	JAE   sequenceDecs_decodeSync_amd64_ll_update_zero
  1940  	NEGQ  CX
  1941  	SHRQ  CL, R14
  1942  	ADDQ  R14, AX
  1943  
  1944  sequenceDecs_decodeSync_amd64_ll_update_zero:
  1945  	MOVQ AX, 24(SP)
  1946  
  1947  	// Fill bitreader for state updates
  1948  	MOVQ    R13, (SP)
  1949  	MOVQ    R9, AX
  1950  	SHRQ    $0x08, AX
  1951  	MOVBQZX AL, AX
  1952  	MOVQ    ctx+16(FP), CX
  1953  	CMPQ    96(CX), $0x00
  1954  	JZ      sequenceDecs_decodeSync_amd64_skip_update
  1955  
  1956  	// Update Literal Length State
  1957  	MOVBQZX DI, R13
  1958  	SHRL    $0x10, DI
  1959  	LEAQ    (BX)(R13*1), CX
  1960  	MOVQ    DX, R14
  1961  	MOVQ    CX, BX
  1962  	ROLQ    CL, R14
  1963  	MOVL    $0x00000001, R15
  1964  	MOVB    R13, CL
  1965  	SHLL    CL, R15
  1966  	DECL    R15
  1967  	ANDQ    R15, R14
  1968  	ADDQ    R14, DI
  1969  
  1970  	// Load ctx.llTable
  1971  	MOVQ ctx+16(FP), CX
  1972  	MOVQ (CX), CX
  1973  	MOVQ (CX)(DI*8), DI
  1974  
  1975  	// Update Match Length State
  1976  	MOVBQZX R8, R13
  1977  	SHRL    $0x10, R8
  1978  	LEAQ    (BX)(R13*1), CX
  1979  	MOVQ    DX, R14
  1980  	MOVQ    CX, BX
  1981  	ROLQ    CL, R14
  1982  	MOVL    $0x00000001, R15
  1983  	MOVB    R13, CL
  1984  	SHLL    CL, R15
  1985  	DECL    R15
  1986  	ANDQ    R15, R14
  1987  	ADDQ    R14, R8
  1988  
  1989  	// Load ctx.mlTable
  1990  	MOVQ ctx+16(FP), CX
  1991  	MOVQ 24(CX), CX
  1992  	MOVQ (CX)(R8*8), R8
  1993  
  1994  	// Update Offset State
  1995  	MOVBQZX R9, R13
  1996  	SHRL    $0x10, R9
  1997  	LEAQ    (BX)(R13*1), CX
  1998  	MOVQ    DX, R14
  1999  	MOVQ    CX, BX
  2000  	ROLQ    CL, R14
  2001  	MOVL    $0x00000001, R15
  2002  	MOVB    R13, CL
  2003  	SHLL    CL, R15
  2004  	DECL    R15
  2005  	ANDQ    R15, R14
  2006  	ADDQ    R14, R9
  2007  
  2008  	// Load ctx.ofTable
  2009  	MOVQ ctx+16(FP), CX
  2010  	MOVQ 48(CX), CX
  2011  	MOVQ (CX)(R9*8), R9
  2012  
  2013  sequenceDecs_decodeSync_amd64_skip_update:
  2014  	// Adjust offset
  2015  	MOVQ   s+0(FP), CX
  2016  	MOVQ   8(SP), R13
  2017  	CMPQ   AX, $0x01
  2018  	JBE    sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0
  2019  	MOVUPS 144(CX), X0
  2020  	MOVQ   R13, 144(CX)
  2021  	MOVUPS X0, 152(CX)
  2022  	JMP    sequenceDecs_decodeSync_amd64_after_adjust
  2023  
  2024  sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0:
  2025  	CMPQ 24(SP), $0x00000000
  2026  	JNE  sequenceDecs_decodeSync_amd64_adjust_offset_maybezero
  2027  	INCQ R13
  2028  	JMP  sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
  2029  
  2030  sequenceDecs_decodeSync_amd64_adjust_offset_maybezero:
  2031  	TESTQ R13, R13
  2032  	JNZ   sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
  2033  	MOVQ  144(CX), R13
  2034  	JMP   sequenceDecs_decodeSync_amd64_after_adjust
  2035  
  2036  sequenceDecs_decodeSync_amd64_adjust_offset_nonzero:
  2037  	MOVQ    R13, AX
  2038  	XORQ    R14, R14
  2039  	MOVQ    $-1, R15
  2040  	CMPQ    R13, $0x03
  2041  	CMOVQEQ R14, AX
  2042  	CMOVQEQ R15, R14
  2043  	ADDQ    144(CX)(AX*8), R14
  2044  	JNZ     sequenceDecs_decodeSync_amd64_adjust_temp_valid
  2045  	MOVQ    $0x00000001, R14
  2046  
  2047  sequenceDecs_decodeSync_amd64_adjust_temp_valid:
  2048  	CMPQ R13, $0x01
  2049  	JZ   sequenceDecs_decodeSync_amd64_adjust_skip
  2050  	MOVQ 152(CX), AX
  2051  	MOVQ AX, 160(CX)
  2052  
  2053  sequenceDecs_decodeSync_amd64_adjust_skip:
  2054  	MOVQ 144(CX), AX
  2055  	MOVQ AX, 152(CX)
  2056  	MOVQ R14, 144(CX)
  2057  	MOVQ R14, R13
  2058  
  2059  sequenceDecs_decodeSync_amd64_after_adjust:
  2060  	MOVQ R13, 8(SP)
  2061  
  2062  	// Check values
  2063  	MOVQ  16(SP), AX
  2064  	MOVQ  24(SP), CX
  2065  	LEAQ  (AX)(CX*1), R14
  2066  	MOVQ  s+0(FP), R15
  2067  	ADDQ  R14, 256(R15)
  2068  	MOVQ  ctx+16(FP), R14
  2069  	SUBQ  CX, 104(R14)
  2070  	JS    error_not_enough_literals
  2071  	CMPQ  AX, $0x00020002
  2072  	JA    sequenceDecs_decodeSync_amd64_error_match_len_too_big
  2073  	TESTQ R13, R13
  2074  	JNZ   sequenceDecs_decodeSync_amd64_match_len_ofs_ok
  2075  	TESTQ AX, AX
  2076  	JNZ   sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch
  2077  
  2078  sequenceDecs_decodeSync_amd64_match_len_ofs_ok:
  2079  	MOVQ 24(SP), AX
  2080  	MOVQ 8(SP), CX
  2081  	MOVQ 16(SP), R13
  2082  
  2083  	// Check if we have enough space in s.out
  2084  	LEAQ (AX)(R13*1), R14
  2085  	ADDQ R10, R14
  2086  	CMPQ R14, 32(SP)
  2087  	JA   error_not_enough_space
  2088  
  2089  	// Copy literals
  2090  	TESTQ AX, AX
  2091  	JZ    check_offset
  2092  	XORQ  R14, R14
  2093  
  2094  copy_1:
  2095  	MOVUPS (R11)(R14*1), X0
  2096  	MOVUPS X0, (R10)(R14*1)
  2097  	ADDQ   $0x10, R14
  2098  	CMPQ   R14, AX
  2099  	JB     copy_1
  2100  	ADDQ   AX, R11
  2101  	ADDQ   AX, R10
  2102  	ADDQ   AX, R12
  2103  
  2104  	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
  2105  check_offset:
  2106  	MOVQ R12, AX
  2107  	ADDQ 40(SP), AX
  2108  	CMPQ CX, AX
  2109  	JG   error_match_off_too_big
  2110  	CMPQ CX, 56(SP)
  2111  	JG   error_match_off_too_big
  2112  
  2113  	// Copy match from history
  2114  	MOVQ CX, AX
  2115  	SUBQ R12, AX
  2116  	JLS  copy_match
  2117  	MOVQ 48(SP), R14
  2118  	SUBQ AX, R14
  2119  	CMPQ R13, AX
  2120  	JG   copy_all_from_history
  2121  	MOVQ R13, AX
  2122  	SUBQ $0x10, AX
  2123  	JB   copy_4_small
  2124  
  2125  copy_4_loop:
  2126  	MOVUPS (R14), X0
  2127  	MOVUPS X0, (R10)
  2128  	ADDQ   $0x10, R14
  2129  	ADDQ   $0x10, R10
  2130  	SUBQ   $0x10, AX
  2131  	JAE    copy_4_loop
  2132  	LEAQ   16(R14)(AX*1), R14
  2133  	LEAQ   16(R10)(AX*1), R10
  2134  	MOVUPS -16(R14), X0
  2135  	MOVUPS X0, -16(R10)
  2136  	JMP    copy_4_end
  2137  
  2138  copy_4_small:
  2139  	CMPQ R13, $0x03
  2140  	JE   copy_4_move_3
  2141  	CMPQ R13, $0x08
  2142  	JB   copy_4_move_4through7
  2143  	JMP  copy_4_move_8through16
  2144  
  2145  copy_4_move_3:
  2146  	MOVW (R14), AX
  2147  	MOVB 2(R14), CL
  2148  	MOVW AX, (R10)
  2149  	MOVB CL, 2(R10)
  2150  	ADDQ R13, R14
  2151  	ADDQ R13, R10
  2152  	JMP  copy_4_end
  2153  
  2154  copy_4_move_4through7:
  2155  	MOVL (R14), AX
  2156  	MOVL -4(R14)(R13*1), CX
  2157  	MOVL AX, (R10)
  2158  	MOVL CX, -4(R10)(R13*1)
  2159  	ADDQ R13, R14
  2160  	ADDQ R13, R10
  2161  	JMP  copy_4_end
  2162  
  2163  copy_4_move_8through16:
  2164  	MOVQ (R14), AX
  2165  	MOVQ -8(R14)(R13*1), CX
  2166  	MOVQ AX, (R10)
  2167  	MOVQ CX, -8(R10)(R13*1)
  2168  	ADDQ R13, R14
  2169  	ADDQ R13, R10
  2170  
  2171  copy_4_end:
  2172  	ADDQ R13, R12
  2173  	JMP  handle_loop
  2174  	JMP loop_finished
  2175  
  2176  copy_all_from_history:
  2177  	MOVQ AX, R15
  2178  	SUBQ $0x10, R15
  2179  	JB   copy_5_small
  2180  
  2181  copy_5_loop:
  2182  	MOVUPS (R14), X0
  2183  	MOVUPS X0, (R10)
  2184  	ADDQ   $0x10, R14
  2185  	ADDQ   $0x10, R10
  2186  	SUBQ   $0x10, R15
  2187  	JAE    copy_5_loop
  2188  	LEAQ   16(R14)(R15*1), R14
  2189  	LEAQ   16(R10)(R15*1), R10
  2190  	MOVUPS -16(R14), X0
  2191  	MOVUPS X0, -16(R10)
  2192  	JMP    copy_5_end
  2193  
  2194  copy_5_small:
  2195  	CMPQ AX, $0x03
  2196  	JE   copy_5_move_3
  2197  	JB   copy_5_move_1or2
  2198  	CMPQ AX, $0x08
  2199  	JB   copy_5_move_4through7
  2200  	JMP  copy_5_move_8through16
  2201  
  2202  copy_5_move_1or2:
  2203  	MOVB (R14), R15
  2204  	MOVB -1(R14)(AX*1), BP
  2205  	MOVB R15, (R10)
  2206  	MOVB BP, -1(R10)(AX*1)
  2207  	ADDQ AX, R14
  2208  	ADDQ AX, R10
  2209  	JMP  copy_5_end
  2210  
  2211  copy_5_move_3:
  2212  	MOVW (R14), R15
  2213  	MOVB 2(R14), BP
  2214  	MOVW R15, (R10)
  2215  	MOVB BP, 2(R10)
  2216  	ADDQ AX, R14
  2217  	ADDQ AX, R10
  2218  	JMP  copy_5_end
  2219  
  2220  copy_5_move_4through7:
  2221  	MOVL (R14), R15
  2222  	MOVL -4(R14)(AX*1), BP
  2223  	MOVL R15, (R10)
  2224  	MOVL BP, -4(R10)(AX*1)
  2225  	ADDQ AX, R14
  2226  	ADDQ AX, R10
  2227  	JMP  copy_5_end
  2228  
  2229  copy_5_move_8through16:
  2230  	MOVQ (R14), R15
  2231  	MOVQ -8(R14)(AX*1), BP
  2232  	MOVQ R15, (R10)
  2233  	MOVQ BP, -8(R10)(AX*1)
  2234  	ADDQ AX, R14
  2235  	ADDQ AX, R10
  2236  
  2237  copy_5_end:
  2238  	ADDQ AX, R12
  2239  	SUBQ AX, R13
  2240  
  2241  	// Copy match from the current buffer
  2242  copy_match:
  2243  	MOVQ R10, AX
  2244  	SUBQ CX, AX
  2245  
  2246  	// ml <= mo
  2247  	CMPQ R13, CX
  2248  	JA   copy_overlapping_match
  2249  
  2250  	// Copy non-overlapping match
  2251  	ADDQ R13, R12
  2252  	MOVQ R10, CX
  2253  	ADDQ R13, R10
  2254  
  2255  copy_2:
  2256  	MOVUPS (AX), X0
  2257  	MOVUPS X0, (CX)
  2258  	ADDQ   $0x10, AX
  2259  	ADDQ   $0x10, CX
  2260  	SUBQ   $0x10, R13
  2261  	JHI    copy_2
  2262  	JMP    handle_loop
  2263  
  2264  	// Copy overlapping match
  2265  copy_overlapping_match:
  2266  	ADDQ R13, R12
  2267  
  2268  copy_slow_3:
  2269  	MOVB (AX), CL
  2270  	MOVB CL, (R10)
  2271  	INCQ AX
  2272  	INCQ R10
  2273  	DECQ R13
  2274  	JNZ  copy_slow_3
  2275  
  2276  handle_loop:
  2277  	MOVQ ctx+16(FP), AX
  2278  	DECQ 96(AX)
  2279  	JNS  sequenceDecs_decodeSync_amd64_main_loop
  2280  
  2281  loop_finished:
  2282  	MOVQ br+8(FP), AX
  2283  	MOVQ DX, 24(AX)
  2284  	MOVB BL, 32(AX)
  2285  	MOVQ SI, 8(AX)
  2286  
  2287  	// Update the context
  2288  	MOVQ ctx+16(FP), AX
  2289  	MOVQ R12, 136(AX)
  2290  	MOVQ 144(AX), CX
  2291  	SUBQ CX, R11
  2292  	MOVQ R11, 168(AX)
  2293  
  2294  	// Return success
  2295  	MOVQ $0x00000000, ret+24(FP)
  2296  	RET
  2297  
  2298  	// Return with match length error
  2299  sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch:
  2300  	MOVQ 16(SP), AX
  2301  	MOVQ ctx+16(FP), CX
  2302  	MOVQ AX, 216(CX)
  2303  	MOVQ $0x00000001, ret+24(FP)
  2304  	RET
  2305  
  2306  	// Return with match too long error
  2307  sequenceDecs_decodeSync_amd64_error_match_len_too_big:
  2308  	MOVQ ctx+16(FP), AX
  2309  	MOVQ 16(SP), CX
  2310  	MOVQ CX, 216(AX)
  2311  	MOVQ $0x00000002, ret+24(FP)
  2312  	RET
  2313  
  2314  	// Return with match offset too long error
  2315  error_match_off_too_big:
  2316  	MOVQ ctx+16(FP), AX
  2317  	MOVQ 8(SP), CX
  2318  	MOVQ CX, 224(AX)
  2319  	MOVQ R12, 136(AX)
  2320  	MOVQ $0x00000003, ret+24(FP)
  2321  	RET
  2322  
  2323  	// Return with not enough literals error
  2324  error_not_enough_literals:
  2325  	MOVQ ctx+16(FP), AX
  2326  	MOVQ 24(SP), CX
  2327  	MOVQ CX, 208(AX)
  2328  	MOVQ $0x00000004, ret+24(FP)
  2329  	RET
  2330  
  2331  	// Return with overread error
  2332  error_overread:
  2333  	MOVQ $0x00000006, ret+24(FP)
  2334  	RET
  2335  
  2336  	// Return with not enough output space error
  2337  error_not_enough_space:
  2338  	MOVQ ctx+16(FP), AX
  2339  	MOVQ 24(SP), CX
  2340  	MOVQ CX, 208(AX)
  2341  	MOVQ 16(SP), CX
  2342  	MOVQ CX, 216(AX)
  2343  	MOVQ R12, 136(AX)
  2344  	MOVQ $0x00000005, ret+24(FP)
  2345  	RET
  2346  
  2347  // func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
  2348  // Requires: BMI, BMI2, CMOV, SSE
        //
        // Decodes sequences and executes them in the same pass. Each loop iteration:
        //   1. refills the bit container and reads offset, match length and literal
        //      length (BEXTR/BZHI based bit extraction),
        //   2. advances the offset / match-length / literal-length states through the
        //      ofTable / mlTable / llTable pointers held in ctx (skipped when the
        //      counter at ctx+96 is already zero),
        //   3. resolves the offset against the three-entry offset history at
        //      s+144, s+152, s+160,
        //   4. validates the values, copies the literals and then the match (from
        //      the history buffer and/or the current output) to the output pointer.
        // The loop repeats while the counter at ctx+96 stays non-negative after
        // being decremented.
        //
        // Return value (ret+24(FP)):
        //   0 success               1 match length set but offset is zero
        //   2 match too long        3 match offset too big
        //   4 not enough literals   5 not enough output space
        //   6 bit stream overread
        // Only the success path writes the bit reader fields back to br; the error
        // paths store diagnostic values into ctx instead.
        //
        // Register roles inside the loop:
        //   AX  bit container (8-byte field at br+24)
        //   DX  number of bits already consumed from AX (byte at br+32)
        //   BX  input bytes still unread (field at br+8)
        //   SI, DI, R8  literal-length, match-length and offset state words
        //   R9  output write pointer       R10 literals read pointer
        //   R11 output position            CX, R12-R15, BP, X0  scratch
        // Stack slots:
        //   0(SP)  input read pointer      8(SP)  offset (mo)
        //   16(SP) match length (ml)       24(SP) literal length (ll)
        //   32(SP) output end pointer      40(SP) len(hist)
        //   48(SP) history end pointer     56(SP) window size
        //
        // NOTE(review): copy_1 and copy_2 move whole 16-byte chunks and can read and
        // write up to 15 bytes beyond the requested length; presumably the caller
        // only selects this variant when the buffers have that slack - confirm.
  2349  TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32
  2350  	MOVQ    br+8(FP), BX
  2351  	MOVQ    24(BX), AX // AX = bit container
  2352  	MOVBQZX 32(BX), DX // DX = bits consumed so far
  2353  	MOVQ    (BX), CX // CX = input base pointer
  2354  	MOVQ    8(BX), BX // BX = input bytes remaining
  2355  	ADDQ    BX, CX
  2356  	MOVQ    CX, (SP) // 0(SP) = base + remaining = current read pointer
  2357  	MOVQ    ctx+16(FP), CX
  2358  	MOVQ    72(CX), SI // SI = literal length state
  2359  	MOVQ    80(CX), DI // DI = match length state
  2360  	MOVQ    88(CX), R8 // R8 = offset state
  2361  	XORQ    R9, R9
  2362  	MOVQ    R9, 8(SP) // mo = 0
  2363  	MOVQ    R9, 16(SP) // ml = 0
  2364  	MOVQ    R9, 24(SP) // ll = 0
  2365  	MOVQ    112(CX), R9 // R9 = output base pointer
  2366  	MOVQ    128(CX), R10
  2367  	MOVQ    R10, 32(SP) // 32(SP) = value at ctx+128; turned into the output end pointer below
  2368  	MOVQ    144(CX), R10 // R10 = literals read pointer
  2369  	MOVQ    136(CX), R11 // R11 = output position
  2370  	MOVQ    200(CX), R12
  2371  	MOVQ    R12, 56(SP) // 56(SP) = window size (compared against mo in check_offset)
  2372  	MOVQ    176(CX), R12
  2373  	MOVQ    R12, 48(SP) // 48(SP) = value at ctx+176 (presumably the history base pointer - confirm)
  2374  	MOVQ    184(CX), CX
  2375  	MOVQ    CX, 40(SP) // 40(SP) = len(hist)
  2376  	MOVQ    40(SP), CX // NOTE(review): reloads the value just stored; CX already holds it
  2377  	ADDQ    CX, 48(SP) // 48(SP) += len(hist): pointer just past the history
  2378  
  2379  	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
  2380  	ADDQ R9, 32(SP)
  2381  
  2382  	// outBase += outPosition
  2383  	ADDQ R11, R9
  2384  
  2385  sequenceDecs_decodeSync_bmi2_main_loop:
  2386  	MOVQ (SP), R12 // R12 = input read pointer
  2387  
  2388  	// Fill bitreader to have enough for the offset and match length.
  2389  	CMPQ BX, $0x08
  2390  	JL   sequenceDecs_decodeSync_bmi2_fill_byte_by_byte // fewer than 8 input bytes left: slow path
  2391  	MOVQ DX, CX
  2392  	SHRQ $0x03, CX // CX = whole bytes already consumed
  2393  	SUBQ CX, R12 // move the read pointer back by that many bytes
  2394  	MOVQ (R12), AX // reload a full 8-byte container
  2395  	SUBQ CX, BX
  2396  	ANDQ $0x07, DX // keep only the sub-byte part of the bit count
  2397  	JMP  sequenceDecs_decodeSync_bmi2_fill_end
  2398  
  2399  sequenceDecs_decodeSync_bmi2_fill_byte_by_byte:
        	// Shift in one byte at a time while input remains and at least 8 bits are consumed.
  2400  	CMPQ    BX, $0x00
  2401  	JLE     sequenceDecs_decodeSync_bmi2_fill_check_overread
  2402  	CMPQ    DX, $0x07
  2403  	JLE     sequenceDecs_decodeSync_bmi2_fill_end
  2404  	SHLQ    $0x08, AX
  2405  	SUBQ    $0x01, R12
  2406  	SUBQ    $0x01, BX
  2407  	SUBQ    $0x08, DX
  2408  	MOVBQZX (R12), CX
  2409  	ORQ     CX, AX
  2410  	JMP     sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
  2411  
  2412  sequenceDecs_decodeSync_bmi2_fill_check_overread:
        	// Input exhausted: more than 64 consumed bits means we read past the stream.
  2413  	CMPQ DX, $0x40
  2414  	JA   error_overread
  2415  
  2416  sequenceDecs_decodeSync_bmi2_fill_end:
  2417  	// Update offset
  2418  	MOVQ   $0x00000808, CX // BEXTR control: start bit 8, length 8
  2419  	BEXTRQ CX, R8, R13 // R13 = bits 8..15 of the offset state = number of bits to read
  2420  	MOVQ   AX, R14
  2421  	LEAQ   (DX)(R13*1), CX // CX = bits consumed after this read
  2422  	ROLQ   CL, R14 // rotate so the wanted bits land in the low end
  2423  	BZHIQ  R13, R14, R14 // keep only the low R13 bits
  2424  	MOVQ   CX, DX
  2425  	MOVQ   R8, CX
  2426  	SHRQ   $0x20, CX // upper 32 bits of the state: base value
  2427  	ADDQ   R14, CX
  2428  	MOVQ   CX, 8(SP) // mo = base + extra bits
  2429  
  2430  	// Update match length
  2431  	MOVQ   $0x00000808, CX
  2432  	BEXTRQ CX, DI, R13 // R13 = bits 8..15 of the match length state
  2433  	MOVQ   AX, R14
  2434  	LEAQ   (DX)(R13*1), CX
  2435  	ROLQ   CL, R14
  2436  	BZHIQ  R13, R14, R14
  2437  	MOVQ   CX, DX
  2438  	MOVQ   DI, CX
  2439  	SHRQ   $0x20, CX
  2440  	ADDQ   R14, CX
  2441  	MOVQ   CX, 16(SP) // ml = base + extra bits
  2442  
  2443  	// Fill bitreader to have enough for the remaining
  2444  	CMPQ BX, $0x08
  2445  	JL   sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
  2446  	MOVQ DX, CX
  2447  	SHRQ $0x03, CX
  2448  	SUBQ CX, R12
  2449  	MOVQ (R12), AX
  2450  	SUBQ CX, BX
  2451  	ANDQ $0x07, DX
  2452  	JMP  sequenceDecs_decodeSync_bmi2_fill_2_end
  2453  
  2454  sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte:
  2455  	CMPQ    BX, $0x00
  2456  	JLE     sequenceDecs_decodeSync_bmi2_fill_2_check_overread
  2457  	CMPQ    DX, $0x07
  2458  	JLE     sequenceDecs_decodeSync_bmi2_fill_2_end
  2459  	SHLQ    $0x08, AX
  2460  	SUBQ    $0x01, R12
  2461  	SUBQ    $0x01, BX
  2462  	SUBQ    $0x08, DX
  2463  	MOVBQZX (R12), CX
  2464  	ORQ     CX, AX
  2465  	JMP     sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
  2466  
  2467  sequenceDecs_decodeSync_bmi2_fill_2_check_overread:
  2468  	CMPQ DX, $0x40
  2469  	JA   error_overread
  2470  
  2471  sequenceDecs_decodeSync_bmi2_fill_2_end:
  2472  	// Update literal length
  2473  	MOVQ   $0x00000808, CX
  2474  	BEXTRQ CX, SI, R13 // R13 = bits 8..15 of the literal length state
  2475  	MOVQ   AX, R14
  2476  	LEAQ   (DX)(R13*1), CX
  2477  	ROLQ   CL, R14
  2478  	BZHIQ  R13, R14, R14
  2479  	MOVQ   CX, DX
  2480  	MOVQ   SI, CX
  2481  	SHRQ   $0x20, CX
  2482  	ADDQ   R14, CX
  2483  	MOVQ   CX, 24(SP) // ll = base + extra bits
  2484  
  2485  	// Fill bitreader for state updates
  2486  	MOVQ    R12, (SP) // save the read pointer; R12 is reused below
  2487  	MOVQ    $0x00000808, CX
  2488  	BEXTRQ  CX, R8, R12 // R12 = offset bit count of the current state, tested in "Adjust offset"
  2489  	MOVQ    ctx+16(FP), CX
  2490  	CMPQ    96(CX), $0x00
  2491  	JZ      sequenceDecs_decodeSync_bmi2_skip_update // counter is zero: no further state update
  2492  	LEAQ    (SI)(DI*1), R13
  2493  	ADDQ    R8, R13
  2494  	MOVBQZX R13, R13 // low byte of the summed states = total bits for all three updates
  2495  	LEAQ    (DX)(R13*1), CX
  2496  	MOVQ    AX, R14
  2497  	MOVQ    CX, DX
  2498  	ROLQ    CL, R14
  2499  	BZHIQ   R13, R14, R14 // R14 = all state bits, read in one go
  2500  
  2501  	// Update Offset State
  2502  	BZHIQ R8, R14, CX // CX = low (R8 & 0xFF) bits of R14
  2503  	SHRXQ R8, R14, R14 // drop those bits for the next state
  2504  	SHRL  $0x10, R8 // 32-bit shift: bits 16..31 of the old state become the new base
  2505  	ADDQ  CX, R8 // table index = base + bits read
  2506  
  2507  	// Load ctx.ofTable
  2508  	MOVQ ctx+16(FP), CX
  2509  	MOVQ 48(CX), CX
  2510  	MOVQ (CX)(R8*8), R8 // entries are 8 bytes each
  2511  
  2512  	// Update Match Length State
  2513  	BZHIQ DI, R14, CX
  2514  	SHRXQ DI, R14, R14
  2515  	SHRL  $0x10, DI
  2516  	ADDQ  CX, DI
  2517  
  2518  	// Load ctx.mlTable
  2519  	MOVQ ctx+16(FP), CX
  2520  	MOVQ 24(CX), CX
  2521  	MOVQ (CX)(DI*8), DI
  2522  
  2523  	// Update Literal Length State
  2524  	BZHIQ SI, R14, CX // last consumer of R14, so no shift afterwards
  2525  	SHRL  $0x10, SI
  2526  	ADDQ  CX, SI
  2527  
  2528  	// Load ctx.llTable
  2529  	MOVQ ctx+16(FP), CX
  2530  	MOVQ (CX), CX
  2531  	MOVQ (CX)(SI*8), SI
  2532  
  2533  sequenceDecs_decodeSync_bmi2_skip_update:
  2534  	// Adjust offset
  2535  	MOVQ   s+0(FP), CX
  2536  	MOVQ   8(SP), R13 // R13 = mo as read from the stream
  2537  	CMPQ   R12, $0x01
  2538  	JBE    sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0 // bit count <= 1: value refers to the offset history
  2539  	MOVUPS 144(CX), X0 // X0 = the two most recent history entries
  2540  	MOVQ   R13, 144(CX) // new offset becomes entry 0
  2541  	MOVUPS X0, 152(CX) // old entries 0 and 1 move to slots 1 and 2
  2542  	JMP    sequenceDecs_decodeSync_bmi2_after_adjust
  2543  
  2544  sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0:
  2545  	CMPQ 24(SP), $0x00000000
  2546  	JNE  sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero
  2547  	INCQ R13 // ll == 0: the history index is shifted by one
  2548  	JMP  sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
  2549  
  2550  sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero:
  2551  	TESTQ R13, R13
  2552  	JNZ   sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
  2553  	MOVQ  144(CX), R13 // index 0: reuse history entry 0, history unchanged
  2554  	JMP   sequenceDecs_decodeSync_bmi2_after_adjust
  2555  
  2556  sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero:
        	// temp = history[R13], except R13 == 3 which selects history[0] - 1.
  2557  	MOVQ    R13, R12
  2558  	XORQ    R14, R14
  2559  	MOVQ    $-1, R15
  2560  	CMPQ    R13, $0x03
  2561  	CMOVQEQ R14, R12 // R13 == 3: index 0
  2562  	CMOVQEQ R15, R14 // R13 == 3: addend -1
  2563  	ADDQ    144(CX)(R12*8), R14
  2564  	JNZ     sequenceDecs_decodeSync_bmi2_adjust_temp_valid
  2565  	MOVQ    $0x00000001, R14 // a zero result is replaced by 1
  2566  
  2567  sequenceDecs_decodeSync_bmi2_adjust_temp_valid:
  2568  	CMPQ R13, $0x01
  2569  	JZ   sequenceDecs_decodeSync_bmi2_adjust_skip // index 1: slot 2 is left untouched
  2570  	MOVQ 152(CX), R12
  2571  	MOVQ R12, 160(CX) // slot 2 = old slot 1
  2572  
  2573  sequenceDecs_decodeSync_bmi2_adjust_skip:
  2574  	MOVQ 144(CX), R12
  2575  	MOVQ R12, 152(CX) // slot 1 = old slot 0
  2576  	MOVQ R14, 144(CX) // slot 0 = resolved offset
  2577  	MOVQ R14, R13
  2578  
  2579  sequenceDecs_decodeSync_bmi2_after_adjust:
  2580  	MOVQ R13, 8(SP) // store the resolved offset
  2581  
  2582  	// Check values
  2583  	MOVQ  16(SP), CX // CX = ml
  2584  	MOVQ  24(SP), R12 // R12 = ll
  2585  	LEAQ  (CX)(R12*1), R14
  2586  	MOVQ  s+0(FP), R15
  2587  	ADDQ  R14, 256(R15) // add ml + ll to the running total kept at s+256
  2588  	MOVQ  ctx+16(FP), R14
  2589  	SUBQ  R12, 104(R14) // consume ll from the literal budget at ctx+104
  2590  	JS    error_not_enough_literals
  2591  	CMPQ  CX, $0x00020002
  2592  	JA    sequenceDecs_decodeSync_bmi2_error_match_len_too_big
  2593  	TESTQ R13, R13
  2594  	JNZ   sequenceDecs_decodeSync_bmi2_match_len_ofs_ok
  2595  	TESTQ CX, CX
  2596  	JNZ   sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch // mo == 0 with ml != 0 is invalid
  2597  
  2598  sequenceDecs_decodeSync_bmi2_match_len_ofs_ok:
  2599  	MOVQ 24(SP), CX // CX = ll
  2600  	MOVQ 8(SP), R12 // R12 = mo
  2601  	MOVQ 16(SP), R13 // R13 = ml
  2602  
  2603  	// Check if we have enough space in s.out
  2604  	LEAQ (CX)(R13*1), R14
  2605  	ADDQ R9, R14 // R14 = write pointer after this sequence
  2606  	CMPQ R14, 32(SP)
  2607  	JA   error_not_enough_space
  2608  
  2609  	// Copy literals
  2610  	TESTQ CX, CX
  2611  	JZ    check_offset
  2612  	XORQ  R14, R14
  2613  
  2614  copy_1:
        	// 16 bytes per iteration until at least ll bytes are copied (may overshoot ll).
  2615  	MOVUPS (R10)(R14*1), X0
  2616  	MOVUPS X0, (R9)(R14*1)
  2617  	ADDQ   $0x10, R14
  2618  	CMPQ   R14, CX
  2619  	JB     copy_1
  2620  	ADDQ   CX, R10 // advance literal pointer, output pointer and output position by exactly ll
  2621  	ADDQ   CX, R9
  2622  	ADDQ   CX, R11
  2623  
  2624  	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
  2625  check_offset:
  2626  	MOVQ R11, CX
  2627  	ADDQ 40(SP), CX // CX = output position + len(hist)
  2628  	CMPQ R12, CX
  2629  	JG   error_match_off_too_big
  2630  	CMPQ R12, 56(SP)
  2631  	JG   error_match_off_too_big
  2632  
  2633  	// Copy match from history
  2634  	MOVQ R12, CX
  2635  	SUBQ R11, CX // CX = mo - output position = bytes the match reaches into history
  2636  	JLS  copy_match // mo <= output position: match lies entirely in the current output
  2637  	MOVQ 48(SP), R14
  2638  	SUBQ CX, R14 // R14 = source pointer inside the history
  2639  	CMPQ R13, CX
  2640  	JG   copy_all_from_history // match continues past the end of history
  2641  	MOVQ R13, CX
  2642  	SUBQ $0x10, CX
  2643  	JB   copy_4_small // ml < 16
  2644  
  2645  copy_4_loop:
        	// Whole match comes from history: 16-byte chunks, then one 16-byte copy ending at the exact end.
  2646  	MOVUPS (R14), X0
  2647  	MOVUPS X0, (R9)
  2648  	ADDQ   $0x10, R14
  2649  	ADDQ   $0x10, R9
  2650  	SUBQ   $0x10, CX
  2651  	JAE    copy_4_loop
  2652  	LEAQ   16(R14)(CX*1), R14 // CX is negative here: both pointers land on start + ml
  2653  	LEAQ   16(R9)(CX*1), R9
  2654  	MOVUPS -16(R14), X0
  2655  	MOVUPS X0, -16(R9)
  2656  	JMP    copy_4_end
  2657  
  2658  copy_4_small:
        	// NOTE(review): there is no 1-2 byte case here (copy_5_small has one); a length
        	// below 3 would take the 4..7 path - presumably ml >= 3 always holds, confirm.
  2659  	CMPQ R13, $0x03
  2660  	JE   copy_4_move_3
  2661  	CMPQ R13, $0x08
  2662  	JB   copy_4_move_4through7
  2663  	JMP  copy_4_move_8through16
  2664  
  2665  copy_4_move_3:
  2666  	MOVW (R14), CX
  2667  	MOVB 2(R14), R12
  2668  	MOVW CX, (R9)
  2669  	MOVB R12, 2(R9)
  2670  	ADDQ R13, R14
  2671  	ADDQ R13, R9
  2672  	JMP  copy_4_end
  2673  
  2674  copy_4_move_4through7:
        	// First 4 and last 4 bytes; the two moves overlap for lengths below 8.
  2675  	MOVL (R14), CX
  2676  	MOVL -4(R14)(R13*1), R12
  2677  	MOVL CX, (R9)
  2678  	MOVL R12, -4(R9)(R13*1)
  2679  	ADDQ R13, R14
  2680  	ADDQ R13, R9
  2681  	JMP  copy_4_end
  2682  
  2683  copy_4_move_8through16:
        	// First 8 and last 8 bytes; the two moves overlap for lengths below 16.
  2684  	MOVQ (R14), CX
  2685  	MOVQ -8(R14)(R13*1), R12
  2686  	MOVQ CX, (R9)
  2687  	MOVQ R12, -8(R9)(R13*1)
  2688  	ADDQ R13, R14
  2689  	ADDQ R13, R9
  2690  
  2691  copy_4_end:
  2692  	ADDQ R13, R11 // output position += ml
  2693  	JMP  handle_loop
  2694  	JMP loop_finished // NOTE(review): unreachable - follows an unconditional JMP and carries no label
  2695  
  2696  copy_all_from_history:
        	// Copy the CX bytes that remain in history; the rest of the match is taken from the output below.
  2697  	MOVQ CX, R15
  2698  	SUBQ $0x10, R15
  2699  	JB   copy_5_small // fewer than 16 history bytes
  2700  
  2701  copy_5_loop:
  2702  	MOVUPS (R14), X0
  2703  	MOVUPS X0, (R9)
  2704  	ADDQ   $0x10, R14
  2705  	ADDQ   $0x10, R9
  2706  	SUBQ   $0x10, R15
  2707  	JAE    copy_5_loop
  2708  	LEAQ   16(R14)(R15*1), R14 // R15 is negative here: both pointers land on start + CX
  2709  	LEAQ   16(R9)(R15*1), R9
  2710  	MOVUPS -16(R14), X0
  2711  	MOVUPS X0, -16(R9)
  2712  	JMP    copy_5_end
  2713  
  2714  copy_5_small:
  2715  	CMPQ CX, $0x03
  2716  	JE   copy_5_move_3
  2717  	JB   copy_5_move_1or2
  2718  	CMPQ CX, $0x08
  2719  	JB   copy_5_move_4through7
  2720  	JMP  copy_5_move_8through16
  2721  
  2722  copy_5_move_1or2:
        	// First and last byte; for a length of 1 both moves hit the same byte.
  2723  	MOVB (R14), R15
  2724  	MOVB -1(R14)(CX*1), BP
  2725  	MOVB R15, (R9)
  2726  	MOVB BP, -1(R9)(CX*1)
  2727  	ADDQ CX, R14
  2728  	ADDQ CX, R9
  2729  	JMP  copy_5_end
  2730  
  2731  copy_5_move_3:
  2732  	MOVW (R14), R15
  2733  	MOVB 2(R14), BP
  2734  	MOVW R15, (R9)
  2735  	MOVB BP, 2(R9)
  2736  	ADDQ CX, R14
  2737  	ADDQ CX, R9
  2738  	JMP  copy_5_end
  2739  
  2740  copy_5_move_4through7:
  2741  	MOVL (R14), R15
  2742  	MOVL -4(R14)(CX*1), BP
  2743  	MOVL R15, (R9)
  2744  	MOVL BP, -4(R9)(CX*1)
  2745  	ADDQ CX, R14
  2746  	ADDQ CX, R9
  2747  	JMP  copy_5_end
  2748  
  2749  copy_5_move_8through16:
  2750  	MOVQ (R14), R15
  2751  	MOVQ -8(R14)(CX*1), BP
  2752  	MOVQ R15, (R9)
  2753  	MOVQ BP, -8(R9)(CX*1)
  2754  	ADDQ CX, R14
  2755  	ADDQ CX, R9
  2756  
  2757  copy_5_end:
  2758  	ADDQ CX, R11 // output position += bytes taken from history
  2759  	SUBQ CX, R13 // R13 = match bytes still to copy
  2760  
  2761  	// Copy match from the current buffer
  2762  copy_match:
  2763  	MOVQ R9, CX
  2764  	SUBQ R12, CX // CX = source pointer = write pointer - mo
  2765  
  2766  	// ml <= mo
  2767  	CMPQ R13, R12
  2768  	JA   copy_overlapping_match
  2769  
  2770  	// Copy non-overlapping match
  2771  	ADDQ R13, R11 // output position += ml
  2772  	MOVQ R9, R12 // R12 = destination cursor (mo is no longer needed)
  2773  	ADDQ R13, R9 // write pointer advances by exactly ml
  2774  
  2775  copy_2:
        	// 16 bytes per iteration while bytes remain (may overshoot the match length).
  2776  	MOVUPS (CX), X0
  2777  	MOVUPS X0, (R12)
  2778  	ADDQ   $0x10, CX
  2779  	ADDQ   $0x10, R12
  2780  	SUBQ   $0x10, R13
  2781  	JHI    copy_2
  2782  	JMP    handle_loop
  2783  
  2784  	// Copy overlapping match
  2785  copy_overlapping_match:
  2786  	ADDQ R13, R11 // output position += ml
  2787  
  2788  copy_slow_3:
        	// Byte-wise copy so bytes written earlier in this match can be read back as source.
  2789  	MOVB (CX), R12
  2790  	MOVB R12, (R9)
  2791  	INCQ CX
  2792  	INCQ R9
  2793  	DECQ R13
  2794  	JNZ  copy_slow_3
  2795  
  2796  handle_loop:
  2797  	MOVQ ctx+16(FP), CX
  2798  	DECQ 96(CX) // one sequence done
  2799  	JNS  sequenceDecs_decodeSync_bmi2_main_loop // continue while the counter is still >= 0
  2800  
  2801  loop_finished:
        	// Write the bit reader fields back to br.
  2802  	MOVQ br+8(FP), CX
  2803  	MOVQ AX, 24(CX) // bit container
  2804  	MOVB DL, 32(CX) // bits consumed
  2805  	MOVQ BX, 8(CX) // input bytes remaining
  2806  
  2807  	// Update the context
  2808  	MOVQ ctx+16(FP), AX
  2809  	MOVQ R11, 136(AX) // output position
  2810  	MOVQ 144(AX), CX
  2811  	SUBQ CX, R10 // literal bytes consumed = current literal pointer - starting pointer
  2812  	MOVQ R10, 168(AX)
  2813  
  2814  	// Return success
  2815  	MOVQ $0x00000000, ret+24(FP)
  2816  	RET
  2817  
  2818  	// Return with match length error
  2819  sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch:
  2820  	MOVQ 16(SP), AX
  2821  	MOVQ ctx+16(FP), CX
  2822  	MOVQ AX, 216(CX) // report ml at ctx+216
  2823  	MOVQ $0x00000001, ret+24(FP)
  2824  	RET
  2825  
  2826  	// Return with match too long error
  2827  sequenceDecs_decodeSync_bmi2_error_match_len_too_big:
  2828  	MOVQ ctx+16(FP), AX
  2829  	MOVQ 16(SP), CX
  2830  	MOVQ CX, 216(AX) // report ml at ctx+216
  2831  	MOVQ $0x00000002, ret+24(FP)
  2832  	RET
  2833  
  2834  	// Return with match offset too long error
  2835  error_match_off_too_big:
  2836  	MOVQ ctx+16(FP), AX
  2837  	MOVQ 8(SP), CX
  2838  	MOVQ CX, 224(AX) // report mo at ctx+224
  2839  	MOVQ R11, 136(AX) // output position reached so far
  2840  	MOVQ $0x00000003, ret+24(FP)
  2841  	RET
  2842  
  2843  	// Return with not enough literals error
  2844  error_not_enough_literals:
  2845  	MOVQ ctx+16(FP), AX
  2846  	MOVQ 24(SP), CX
  2847  	MOVQ CX, 208(AX) // report ll at ctx+208
  2848  	MOVQ $0x00000004, ret+24(FP)
  2849  	RET
  2850  
  2851  	// Return with overread error
  2852  error_overread:
  2853  	MOVQ $0x00000006, ret+24(FP)
  2854  	RET
  2855  
  2856  	// Return with not enough output space error
  2857  error_not_enough_space:
  2858  	MOVQ ctx+16(FP), AX
  2859  	MOVQ 24(SP), CX
  2860  	MOVQ CX, 208(AX) // report ll at ctx+208
  2861  	MOVQ 16(SP), CX
  2862  	MOVQ CX, 216(AX) // report ml at ctx+216
  2863  	MOVQ R11, 136(AX) // output position reached so far
  2864  	MOVQ $0x00000005, ret+24(FP)
  2865  	RET
  2866  
  2867  // func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
  2868  // Requires: CMOV, SSE
  2869  TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32
  2870  	MOVQ    br+8(FP), CX
  2871  	MOVQ    24(CX), DX
  2872  	MOVBQZX 32(CX), BX
  2873  	MOVQ    (CX), AX
  2874  	MOVQ    8(CX), SI
  2875  	ADDQ    SI, AX
  2876  	MOVQ    AX, (SP)
  2877  	MOVQ    ctx+16(FP), AX
  2878  	MOVQ    72(AX), DI
  2879  	MOVQ    80(AX), R8
  2880  	MOVQ    88(AX), R9
  2881  	XORQ    CX, CX
  2882  	MOVQ    CX, 8(SP)
  2883  	MOVQ    CX, 16(SP)
  2884  	MOVQ    CX, 24(SP)
  2885  	MOVQ    112(AX), R10
  2886  	MOVQ    128(AX), CX
  2887  	MOVQ    CX, 32(SP)
  2888  	MOVQ    144(AX), R11
  2889  	MOVQ    136(AX), R12
  2890  	MOVQ    200(AX), CX
  2891  	MOVQ    CX, 56(SP)
  2892  	MOVQ    176(AX), CX
  2893  	MOVQ    CX, 48(SP)
  2894  	MOVQ    184(AX), AX
  2895  	MOVQ    AX, 40(SP)
  2896  	MOVQ    40(SP), AX
  2897  	ADDQ    AX, 48(SP)
  2898  
  2899  	// Calculate poiter to s.out[cap(s.out)] (a past-end pointer)
  2900  	ADDQ R10, 32(SP)
  2901  
  2902  	// outBase += outPosition
  2903  	ADDQ R12, R10
  2904  
  2905  sequenceDecs_decodeSync_safe_amd64_main_loop:
  2906  	MOVQ (SP), R13
  2907  
  2908  	// Fill bitreader to have enough for the offset and match length.
  2909  	CMPQ SI, $0x08
  2910  	JL   sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
  2911  	MOVQ BX, AX
  2912  	SHRQ $0x03, AX
  2913  	SUBQ AX, R13
  2914  	MOVQ (R13), DX
  2915  	SUBQ AX, SI
  2916  	ANDQ $0x07, BX
  2917  	JMP  sequenceDecs_decodeSync_safe_amd64_fill_end
  2918  
  2919  sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte:
  2920  	CMPQ    SI, $0x00
  2921  	JLE     sequenceDecs_decodeSync_safe_amd64_fill_check_overread
  2922  	CMPQ    BX, $0x07
  2923  	JLE     sequenceDecs_decodeSync_safe_amd64_fill_end
  2924  	SHLQ    $0x08, DX
  2925  	SUBQ    $0x01, R13
  2926  	SUBQ    $0x01, SI
  2927  	SUBQ    $0x08, BX
  2928  	MOVBQZX (R13), AX
  2929  	ORQ     AX, DX
  2930  	JMP     sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
  2931  
  2932  sequenceDecs_decodeSync_safe_amd64_fill_check_overread:
  2933  	CMPQ BX, $0x40
  2934  	JA   error_overread
  2935  
  2936  sequenceDecs_decodeSync_safe_amd64_fill_end:
  2937  	// Update offset
  2938  	MOVQ  R9, AX
  2939  	MOVQ  BX, CX
  2940  	MOVQ  DX, R14
  2941  	SHLQ  CL, R14
  2942  	MOVB  AH, CL
  2943  	SHRQ  $0x20, AX
  2944  	TESTQ CX, CX
  2945  	JZ    sequenceDecs_decodeSync_safe_amd64_of_update_zero
  2946  	ADDQ  CX, BX
  2947  	CMPQ  BX, $0x40
  2948  	JA    sequenceDecs_decodeSync_safe_amd64_of_update_zero
  2949  	CMPQ  CX, $0x40
  2950  	JAE   sequenceDecs_decodeSync_safe_amd64_of_update_zero
  2951  	NEGQ  CX
  2952  	SHRQ  CL, R14
  2953  	ADDQ  R14, AX
  2954  
  2955  sequenceDecs_decodeSync_safe_amd64_of_update_zero:
  2956  	MOVQ AX, 8(SP)
  2957  
  2958  	// Update match length
  2959  	MOVQ  R8, AX
  2960  	MOVQ  BX, CX
  2961  	MOVQ  DX, R14
  2962  	SHLQ  CL, R14
  2963  	MOVB  AH, CL
  2964  	SHRQ  $0x20, AX
  2965  	TESTQ CX, CX
  2966  	JZ    sequenceDecs_decodeSync_safe_amd64_ml_update_zero
  2967  	ADDQ  CX, BX
  2968  	CMPQ  BX, $0x40
  2969  	JA    sequenceDecs_decodeSync_safe_amd64_ml_update_zero
  2970  	CMPQ  CX, $0x40
  2971  	JAE   sequenceDecs_decodeSync_safe_amd64_ml_update_zero
  2972  	NEGQ  CX
  2973  	SHRQ  CL, R14
  2974  	ADDQ  R14, AX
  2975  
  2976  sequenceDecs_decodeSync_safe_amd64_ml_update_zero:
  2977  	MOVQ AX, 16(SP)
  2978  
  2979  	// Fill bitreader to have enough for the remaining
  2980  	CMPQ SI, $0x08
  2981  	JL   sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
  2982  	MOVQ BX, AX
  2983  	SHRQ $0x03, AX
  2984  	SUBQ AX, R13
  2985  	MOVQ (R13), DX
  2986  	SUBQ AX, SI
  2987  	ANDQ $0x07, BX
  2988  	JMP  sequenceDecs_decodeSync_safe_amd64_fill_2_end
  2989  
  2990  sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte:
  2991  	CMPQ    SI, $0x00
  2992  	JLE     sequenceDecs_decodeSync_safe_amd64_fill_2_check_overread
  2993  	CMPQ    BX, $0x07
  2994  	JLE     sequenceDecs_decodeSync_safe_amd64_fill_2_end
  2995  	SHLQ    $0x08, DX
  2996  	SUBQ    $0x01, R13
  2997  	SUBQ    $0x01, SI
  2998  	SUBQ    $0x08, BX
  2999  	MOVBQZX (R13), AX
  3000  	ORQ     AX, DX
  3001  	JMP     sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
  3002  
  3003  sequenceDecs_decodeSync_safe_amd64_fill_2_check_overread:
  3004  	CMPQ BX, $0x40
  3005  	JA   error_overread
  3006  
  3007  sequenceDecs_decodeSync_safe_amd64_fill_2_end:
  3008  	// Update literal length
  3009  	MOVQ  DI, AX
  3010  	MOVQ  BX, CX
  3011  	MOVQ  DX, R14
  3012  	SHLQ  CL, R14
  3013  	MOVB  AH, CL
  3014  	SHRQ  $0x20, AX
  3015  	TESTQ CX, CX
  3016  	JZ    sequenceDecs_decodeSync_safe_amd64_ll_update_zero
  3017  	ADDQ  CX, BX
  3018  	CMPQ  BX, $0x40
  3019  	JA    sequenceDecs_decodeSync_safe_amd64_ll_update_zero
  3020  	CMPQ  CX, $0x40
  3021  	JAE   sequenceDecs_decodeSync_safe_amd64_ll_update_zero
  3022  	NEGQ  CX
  3023  	SHRQ  CL, R14
  3024  	ADDQ  R14, AX
  3025  
  3026  sequenceDecs_decodeSync_safe_amd64_ll_update_zero:
  3027  	MOVQ AX, 24(SP)
  3028  
  3029  	// Fill bitreader for state updates
  3030  	MOVQ    R13, (SP)
  3031  	MOVQ    R9, AX
  3032  	SHRQ    $0x08, AX
  3033  	MOVBQZX AL, AX
  3034  	MOVQ    ctx+16(FP), CX
  3035  	CMPQ    96(CX), $0x00
  3036  	JZ      sequenceDecs_decodeSync_safe_amd64_skip_update
  3037  
  3038  	// Update Literal Length State
  3039  	MOVBQZX DI, R13
  3040  	SHRL    $0x10, DI
  3041  	LEAQ    (BX)(R13*1), CX
  3042  	MOVQ    DX, R14
  3043  	MOVQ    CX, BX
  3044  	ROLQ    CL, R14
  3045  	MOVL    $0x00000001, R15
  3046  	MOVB    R13, CL
  3047  	SHLL    CL, R15
  3048  	DECL    R15
  3049  	ANDQ    R15, R14
  3050  	ADDQ    R14, DI
  3051  
  3052  	// Load ctx.llTable
  3053  	MOVQ ctx+16(FP), CX
  3054  	MOVQ (CX), CX
  3055  	MOVQ (CX)(DI*8), DI
  3056  
  3057  	// Update Match Length State
  3058  	MOVBQZX R8, R13
  3059  	SHRL    $0x10, R8
  3060  	LEAQ    (BX)(R13*1), CX
  3061  	MOVQ    DX, R14
  3062  	MOVQ    CX, BX
  3063  	ROLQ    CL, R14
  3064  	MOVL    $0x00000001, R15
  3065  	MOVB    R13, CL
  3066  	SHLL    CL, R15
  3067  	DECL    R15
  3068  	ANDQ    R15, R14
  3069  	ADDQ    R14, R8
  3070  
  3071  	// Load ctx.mlTable
  3072  	MOVQ ctx+16(FP), CX
  3073  	MOVQ 24(CX), CX
  3074  	MOVQ (CX)(R8*8), R8
  3075  
  3076  	// Update Offset State
  3077  	MOVBQZX R9, R13
  3078  	SHRL    $0x10, R9
  3079  	LEAQ    (BX)(R13*1), CX
  3080  	MOVQ    DX, R14
  3081  	MOVQ    CX, BX
  3082  	ROLQ    CL, R14
  3083  	MOVL    $0x00000001, R15
  3084  	MOVB    R13, CL
  3085  	SHLL    CL, R15
  3086  	DECL    R15
  3087  	ANDQ    R15, R14
  3088  	ADDQ    R14, R9
  3089  
  3090  	// Load ctx.ofTable
  3091  	MOVQ ctx+16(FP), CX
  3092  	MOVQ 48(CX), CX
  3093  	MOVQ (CX)(R9*8), R9
  3094  
  3095  sequenceDecs_decodeSync_safe_amd64_skip_update:
  3096  	// Adjust offset
  3097  	MOVQ   s+0(FP), CX
  3098  	MOVQ   8(SP), R13
  3099  	CMPQ   AX, $0x01
  3100  	JBE    sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0
  3101  	MOVUPS 144(CX), X0
  3102  	MOVQ   R13, 144(CX)
  3103  	MOVUPS X0, 152(CX)
  3104  	JMP    sequenceDecs_decodeSync_safe_amd64_after_adjust
  3105  
  3106  sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0:
  3107  	CMPQ 24(SP), $0x00000000
  3108  	JNE  sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero
  3109  	INCQ R13
  3110  	JMP  sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
  3111  
  3112  sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero:
  3113  	TESTQ R13, R13
  3114  	JNZ   sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
  3115  	MOVQ  144(CX), R13
  3116  	JMP   sequenceDecs_decodeSync_safe_amd64_after_adjust
  3117  
  3118  sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero:
  3119  	MOVQ    R13, AX
  3120  	XORQ    R14, R14
  3121  	MOVQ    $-1, R15
  3122  	CMPQ    R13, $0x03
  3123  	CMOVQEQ R14, AX
  3124  	CMOVQEQ R15, R14
  3125  	ADDQ    144(CX)(AX*8), R14
  3126  	JNZ     sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid
  3127  	MOVQ    $0x00000001, R14
  3128  
  3129  sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid:
  3130  	CMPQ R13, $0x01
  3131  	JZ   sequenceDecs_decodeSync_safe_amd64_adjust_skip
  3132  	MOVQ 152(CX), AX
  3133  	MOVQ AX, 160(CX)
  3134  
  3135  sequenceDecs_decodeSync_safe_amd64_adjust_skip:
  3136  	MOVQ 144(CX), AX
  3137  	MOVQ AX, 152(CX)
  3138  	MOVQ R14, 144(CX)
  3139  	MOVQ R14, R13
  3140  
  3141  sequenceDecs_decodeSync_safe_amd64_after_adjust:
  3142  	MOVQ R13, 8(SP)
  3143  
  3144  	// Check values
  3145  	MOVQ  16(SP), AX
  3146  	MOVQ  24(SP), CX
  3147  	LEAQ  (AX)(CX*1), R14
  3148  	MOVQ  s+0(FP), R15
  3149  	ADDQ  R14, 256(R15)
  3150  	MOVQ  ctx+16(FP), R14
  3151  	SUBQ  CX, 104(R14)
  3152  	JS    error_not_enough_literals
  3153  	CMPQ  AX, $0x00020002
  3154  	JA    sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big
  3155  	TESTQ R13, R13
  3156  	JNZ   sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok
  3157  	TESTQ AX, AX
  3158  	JNZ   sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch
  3159  
  3160  sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok:
  3161  	MOVQ 24(SP), AX
  3162  	MOVQ 8(SP), CX
  3163  	MOVQ 16(SP), R13
  3164  
  3165  	// Check if we have enough space in s.out
  3166  	LEAQ (AX)(R13*1), R14
  3167  	ADDQ R10, R14
  3168  	CMPQ R14, 32(SP)
  3169  	JA   error_not_enough_space
  3170  
  3171  	// Copy literals
  3172  	TESTQ AX, AX
  3173  	JZ    check_offset
  3174  	MOVQ  AX, R14
  3175  	SUBQ  $0x10, R14
  3176  	JB    copy_1_small
  3177  
  3178  copy_1_loop:
  3179  	MOVUPS (R11), X0
  3180  	MOVUPS X0, (R10)
  3181  	ADDQ   $0x10, R11
  3182  	ADDQ   $0x10, R10
  3183  	SUBQ   $0x10, R14
  3184  	JAE    copy_1_loop
  3185  	LEAQ   16(R11)(R14*1), R11
  3186  	LEAQ   16(R10)(R14*1), R10
  3187  	MOVUPS -16(R11), X0
  3188  	MOVUPS X0, -16(R10)
  3189  	JMP    copy_1_end
  3190  
  3191  copy_1_small:
  3192  	CMPQ AX, $0x03
  3193  	JE   copy_1_move_3
  3194  	JB   copy_1_move_1or2
  3195  	CMPQ AX, $0x08
  3196  	JB   copy_1_move_4through7
  3197  	JMP  copy_1_move_8through16
  3198  
  3199  copy_1_move_1or2:
  3200  	MOVB (R11), R14
  3201  	MOVB -1(R11)(AX*1), R15
  3202  	MOVB R14, (R10)
  3203  	MOVB R15, -1(R10)(AX*1)
  3204  	ADDQ AX, R11
  3205  	ADDQ AX, R10
  3206  	JMP  copy_1_end
  3207  
  3208  copy_1_move_3:
  3209  	MOVW (R11), R14
  3210  	MOVB 2(R11), R15
  3211  	MOVW R14, (R10)
  3212  	MOVB R15, 2(R10)
  3213  	ADDQ AX, R11
  3214  	ADDQ AX, R10
  3215  	JMP  copy_1_end
  3216  
  3217  copy_1_move_4through7:
  3218  	MOVL (R11), R14
  3219  	MOVL -4(R11)(AX*1), R15
  3220  	MOVL R14, (R10)
  3221  	MOVL R15, -4(R10)(AX*1)
  3222  	ADDQ AX, R11
  3223  	ADDQ AX, R10
  3224  	JMP  copy_1_end
  3225  
  3226  copy_1_move_8through16:
  3227  	MOVQ (R11), R14
  3228  	MOVQ -8(R11)(AX*1), R15
  3229  	MOVQ R14, (R10)
  3230  	MOVQ R15, -8(R10)(AX*1)
  3231  	ADDQ AX, R11
  3232  	ADDQ AX, R10
  3233  
  3234  copy_1_end:
  3235  	ADDQ AX, R12
  3236  
  3237  	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
  3238  check_offset:
  3239  	MOVQ R12, AX
  3240  	ADDQ 40(SP), AX
  3241  	CMPQ CX, AX
  3242  	JG   error_match_off_too_big
  3243  	CMPQ CX, 56(SP)
  3244  	JG   error_match_off_too_big
  3245  
  3246  	// Copy match from history
  3247  	MOVQ CX, AX
  3248  	SUBQ R12, AX
  3249  	JLS  copy_match
  3250  	MOVQ 48(SP), R14
  3251  	SUBQ AX, R14
  3252  	CMPQ R13, AX
  3253  	JG   copy_all_from_history
  3254  	MOVQ R13, AX
  3255  	SUBQ $0x10, AX
  3256  	JB   copy_4_small
  3257  
  3258  copy_4_loop:
  3259  	MOVUPS (R14), X0
  3260  	MOVUPS X0, (R10)
  3261  	ADDQ   $0x10, R14
  3262  	ADDQ   $0x10, R10
  3263  	SUBQ   $0x10, AX
  3264  	JAE    copy_4_loop
  3265  	LEAQ   16(R14)(AX*1), R14
  3266  	LEAQ   16(R10)(AX*1), R10
  3267  	MOVUPS -16(R14), X0
  3268  	MOVUPS X0, -16(R10)
  3269  	JMP    copy_4_end
  3270  
  3271  copy_4_small:
  3272  	CMPQ R13, $0x03
  3273  	JE   copy_4_move_3
  3274  	CMPQ R13, $0x08
  3275  	JB   copy_4_move_4through7
  3276  	JMP  copy_4_move_8through16
  3277  
  3278  copy_4_move_3:
  3279  	MOVW (R14), AX
  3280  	MOVB 2(R14), CL
  3281  	MOVW AX, (R10)
  3282  	MOVB CL, 2(R10)
  3283  	ADDQ R13, R14
  3284  	ADDQ R13, R10
  3285  	JMP  copy_4_end
  3286  
  3287  copy_4_move_4through7:
  3288  	MOVL (R14), AX
  3289  	MOVL -4(R14)(R13*1), CX
  3290  	MOVL AX, (R10)
  3291  	MOVL CX, -4(R10)(R13*1)
  3292  	ADDQ R13, R14
  3293  	ADDQ R13, R10
  3294  	JMP  copy_4_end
  3295  
  3296  copy_4_move_8through16:
  3297  	MOVQ (R14), AX
  3298  	MOVQ -8(R14)(R13*1), CX
  3299  	MOVQ AX, (R10)
  3300  	MOVQ CX, -8(R10)(R13*1)
  3301  	ADDQ R13, R14
  3302  	ADDQ R13, R10
  3303  
  3304  copy_4_end:
  3305  	ADDQ R13, R12
  3306  	JMP  handle_loop
  3307  	JMP loop_finished
  3308  
  3309  copy_all_from_history:
  3310  	MOVQ AX, R15
  3311  	SUBQ $0x10, R15
  3312  	JB   copy_5_small
  3313  
  3314  copy_5_loop:
  3315  	MOVUPS (R14), X0
  3316  	MOVUPS X0, (R10)
  3317  	ADDQ   $0x10, R14
  3318  	ADDQ   $0x10, R10
  3319  	SUBQ   $0x10, R15
  3320  	JAE    copy_5_loop
  3321  	LEAQ   16(R14)(R15*1), R14
  3322  	LEAQ   16(R10)(R15*1), R10
  3323  	MOVUPS -16(R14), X0
  3324  	MOVUPS X0, -16(R10)
  3325  	JMP    copy_5_end
  3326  
  3327  copy_5_small:
  3328  	CMPQ AX, $0x03
  3329  	JE   copy_5_move_3
  3330  	JB   copy_5_move_1or2
  3331  	CMPQ AX, $0x08
  3332  	JB   copy_5_move_4through7
  3333  	JMP  copy_5_move_8through16
  3334  
  3335  copy_5_move_1or2:
  3336  	MOVB (R14), R15
  3337  	MOVB -1(R14)(AX*1), BP
  3338  	MOVB R15, (R10)
  3339  	MOVB BP, -1(R10)(AX*1)
  3340  	ADDQ AX, R14
  3341  	ADDQ AX, R10
  3342  	JMP  copy_5_end
  3343  
  3344  copy_5_move_3:
  3345  	MOVW (R14), R15
  3346  	MOVB 2(R14), BP
  3347  	MOVW R15, (R10)
  3348  	MOVB BP, 2(R10)
  3349  	ADDQ AX, R14
  3350  	ADDQ AX, R10
  3351  	JMP  copy_5_end
  3352  
  3353  copy_5_move_4through7:
  3354  	MOVL (R14), R15
  3355  	MOVL -4(R14)(AX*1), BP
  3356  	MOVL R15, (R10)
  3357  	MOVL BP, -4(R10)(AX*1)
  3358  	ADDQ AX, R14
  3359  	ADDQ AX, R10
  3360  	JMP  copy_5_end
  3361  
  3362  copy_5_move_8through16:
  3363  	MOVQ (R14), R15
  3364  	MOVQ -8(R14)(AX*1), BP
  3365  	MOVQ R15, (R10)
  3366  	MOVQ BP, -8(R10)(AX*1)
  3367  	ADDQ AX, R14
  3368  	ADDQ AX, R10
  3369  
  3370  copy_5_end:
  3371  	ADDQ AX, R12
  3372  	SUBQ AX, R13
  3373  
  3374  	// Copy match from the current buffer
  3375  copy_match:
  3376  	MOVQ R10, AX
  3377  	SUBQ CX, AX
  3378  
  3379  	// ml <= mo
  3380  	CMPQ R13, CX
  3381  	JA   copy_overlapping_match
  3382  
  3383  	// Copy non-overlapping match
  3384  	ADDQ R13, R12
  3385  	MOVQ R13, CX
  3386  	SUBQ $0x10, CX
  3387  	JB   copy_2_small
  3388  
  3389  copy_2_loop:
  3390  	MOVUPS (AX), X0
  3391  	MOVUPS X0, (R10)
  3392  	ADDQ   $0x10, AX
  3393  	ADDQ   $0x10, R10
  3394  	SUBQ   $0x10, CX
  3395  	JAE    copy_2_loop
  3396  	LEAQ   16(AX)(CX*1), AX
  3397  	LEAQ   16(R10)(CX*1), R10
  3398  	MOVUPS -16(AX), X0
  3399  	MOVUPS X0, -16(R10)
  3400  	JMP    copy_2_end
  3401  
  3402  copy_2_small:
  3403  	CMPQ R13, $0x03
  3404  	JE   copy_2_move_3
  3405  	JB   copy_2_move_1or2
  3406  	CMPQ R13, $0x08
  3407  	JB   copy_2_move_4through7
  3408  	JMP  copy_2_move_8through16
  3409  
  3410  copy_2_move_1or2:
  3411  	MOVB (AX), CL
  3412  	MOVB -1(AX)(R13*1), R14
  3413  	MOVB CL, (R10)
  3414  	MOVB R14, -1(R10)(R13*1)
  3415  	ADDQ R13, AX
  3416  	ADDQ R13, R10
  3417  	JMP  copy_2_end
  3418  
  3419  copy_2_move_3:
  3420  	MOVW (AX), CX
  3421  	MOVB 2(AX), R14
  3422  	MOVW CX, (R10)
  3423  	MOVB R14, 2(R10)
  3424  	ADDQ R13, AX
  3425  	ADDQ R13, R10
  3426  	JMP  copy_2_end
  3427  
  3428  copy_2_move_4through7:
  3429  	MOVL (AX), CX
  3430  	MOVL -4(AX)(R13*1), R14
  3431  	MOVL CX, (R10)
  3432  	MOVL R14, -4(R10)(R13*1)
  3433  	ADDQ R13, AX
  3434  	ADDQ R13, R10
  3435  	JMP  copy_2_end
  3436  
  3437  copy_2_move_8through16:
  3438  	MOVQ (AX), CX
  3439  	MOVQ -8(AX)(R13*1), R14
  3440  	MOVQ CX, (R10)
  3441  	MOVQ R14, -8(R10)(R13*1)
  3442  	ADDQ R13, AX
  3443  	ADDQ R13, R10
  3444  
  3445  copy_2_end:
  3446  	JMP handle_loop
  3447  
  3448  	// Copy overlapping match
  3449  copy_overlapping_match:
  3450  	ADDQ R13, R12
  3451  
  3452  copy_slow_3:
  3453  	MOVB (AX), CL
  3454  	MOVB CL, (R10)
  3455  	INCQ AX
  3456  	INCQ R10
  3457  	DECQ R13
  3458  	JNZ  copy_slow_3
  3459  
  3460  handle_loop:
  3461  	MOVQ ctx+16(FP), AX
  3462  	DECQ 96(AX)
  3463  	JNS  sequenceDecs_decodeSync_safe_amd64_main_loop
  3464  
  3465  loop_finished:
  3466  	MOVQ br+8(FP), AX
  3467  	MOVQ DX, 24(AX)
  3468  	MOVB BL, 32(AX)
  3469  	MOVQ SI, 8(AX)
  3470  
  3471  	// Update the context
  3472  	MOVQ ctx+16(FP), AX
  3473  	MOVQ R12, 136(AX)
  3474  	MOVQ 144(AX), CX
  3475  	SUBQ CX, R11
  3476  	MOVQ R11, 168(AX)
  3477  
  3478  	// Return success
  3479  	MOVQ $0x00000000, ret+24(FP)
  3480  	RET
  3481  
  3482  	// Return with match length error
  3483  sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch:
  3484  	MOVQ 16(SP), AX
  3485  	MOVQ ctx+16(FP), CX
  3486  	MOVQ AX, 216(CX)
  3487  	MOVQ $0x00000001, ret+24(FP)
  3488  	RET
  3489  
  3490  	// Return with match too long error
  3491  sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big:
  3492  	MOVQ ctx+16(FP), AX
  3493  	MOVQ 16(SP), CX
  3494  	MOVQ CX, 216(AX)
  3495  	MOVQ $0x00000002, ret+24(FP)
  3496  	RET
  3497  
  3498  	// Return with match offset too long error
  3499  error_match_off_too_big:
  3500  	MOVQ ctx+16(FP), AX
  3501  	MOVQ 8(SP), CX
  3502  	MOVQ CX, 224(AX)
  3503  	MOVQ R12, 136(AX)
  3504  	MOVQ $0x00000003, ret+24(FP)
  3505  	RET
  3506  
  3507  	// Return with not enough literals error
  3508  error_not_enough_literals:
  3509  	MOVQ ctx+16(FP), AX
  3510  	MOVQ 24(SP), CX
  3511  	MOVQ CX, 208(AX)
  3512  	MOVQ $0x00000004, ret+24(FP)
  3513  	RET
  3514  
  3515  	// Return with overread error
  3516  error_overread:
  3517  	MOVQ $0x00000006, ret+24(FP)
  3518  	RET
  3519  
  3520  	// Return with not enough output space error
  3521  error_not_enough_space:
  3522  	MOVQ ctx+16(FP), AX
  3523  	MOVQ 24(SP), CX
  3524  	MOVQ CX, 208(AX)
  3525  	MOVQ 16(SP), CX
  3526  	MOVQ CX, 216(AX)
  3527  	MOVQ R12, 136(AX)
  3528  	MOVQ $0x00000005, ret+24(FP)
  3529  	RET
  3530  
  3531  // func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
  3532  // Requires: BMI, BMI2, CMOV, SSE
        //
        // Decodes sequences from the bit stream and immediately copies their
        // literals and matches to the output, one sequence per loop iteration,
        // until the counter at 96(ctx) goes negative or an error is detected.
        //
        // Return value (ret+24(FP)):
        //   0 success
        //   1 match length is non-zero while the match offset is zero
        //   2 match length too big
        //   3 match offset too big
        //   4 not enough literals
        //   5 not enough output space
        //   6 bit stream overread
        //
        // Register roles inside the main loop:
        //   AX        64-bit bit-reader value (loaded from 24(br))
        //   DX        number of bits already consumed from AX (byte at 32(br))
        //   BX        remaining input byte count (8(br))
        //   SI/DI/R8  literal-length / match-length / offset state words
        //   R9        output write pointer (outBase + outPosition)
        //   R10       literals read pointer
        //   R11       output position
        //   CX, R12-R15, BP, X0  scratch
        //
        // Stack slots:
        //   0(SP)   input read pointer
        //   8(SP)   match offset of the current sequence
        //   16(SP)  match length of the current sequence
        //   24(SP)  literal length of the current sequence
        //   32(SP)  past-end pointer of the output buffer
        //   40(SP)  history length
        //   48(SP)  past-end pointer of the history
        //   56(SP)  window size
        //
        // State word layout, as consumed below: bits 0-7 = bit count for the
        // next-state update, bits 8-15 = extra-bit count of the value,
        // bits 16-31 = next-state base, bits 32-63 = value baseline.
  3533  TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32
  3534  	MOVQ    br+8(FP), BX
  3535  	MOVQ    24(BX), AX
  3536  	MOVBQZX 32(BX), DX
  3537  	MOVQ    (BX), CX
  3538  	MOVQ    8(BX), BX
  3539  	ADDQ    BX, CX
  3540  	MOVQ    CX, (SP)
  3541  	MOVQ    ctx+16(FP), CX
  3542  	MOVQ    72(CX), SI
  3543  	MOVQ    80(CX), DI
  3544  	MOVQ    88(CX), R8
        	// Clear the per-sequence offset / match length / literal length slots.
  3545  	XORQ    R9, R9
  3546  	MOVQ    R9, 8(SP)
  3547  	MOVQ    R9, 16(SP)
  3548  	MOVQ    R9, 24(SP)
  3549  	MOVQ    112(CX), R9
  3550  	MOVQ    128(CX), R10
  3551  	MOVQ    R10, 32(SP)
  3552  	MOVQ    144(CX), R10
  3553  	MOVQ    136(CX), R11
  3554  	MOVQ    200(CX), R12
  3555  	MOVQ    R12, 56(SP)
  3556  	MOVQ    176(CX), R12
  3557  	MOVQ    R12, 48(SP)
  3558  	MOVQ    184(CX), CX
  3559  	MOVQ    CX, 40(SP)
        	// NOTE(review): this reload of 40(SP) is redundant, CX still holds the value.
  3560  	MOVQ    40(SP), CX
        	// 48(SP) = 176(ctx) + 184(ctx): pointer just past the end of the history.
  3561  	ADDQ    CX, 48(SP)
  3562  
  3563  	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
  3564  	ADDQ R9, 32(SP)
  3565  
  3566  	// outBase += outPosition
  3567  	ADDQ R11, R9
  3568  
  3569  sequenceDecs_decodeSync_safe_bmi2_main_loop:
  3570  	MOVQ (SP), R12
  3571  
  3572  	// Fill bitreader to have enough for the offset and match length.
        	// Fast path (8 or more input bytes left): step the read pointer back by
        	// the number of whole bytes consumed (DX >> 3), reload 8 bytes into AX
        	// and keep only the sub-byte part of the consumed-bit count.
  3573  	CMPQ BX, $0x08
  3574  	JL   sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
  3575  	MOVQ DX, CX
  3576  	SHRQ $0x03, CX
  3577  	SUBQ CX, R12
  3578  	MOVQ (R12), AX
  3579  	SUBQ CX, BX
  3580  	ANDQ $0x07, DX
  3581  	JMP  sequenceDecs_decodeSync_safe_bmi2_fill_end
  3582  
        	// Slow path: while input remains and at least 8 bits are consumed,
        	// shift one more input byte into the low end of AX.
  3583  sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte:
  3584  	CMPQ    BX, $0x00
  3585  	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_check_overread
  3586  	CMPQ    DX, $0x07
  3587  	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_end
  3588  	SHLQ    $0x08, AX
  3589  	SUBQ    $0x01, R12
  3590  	SUBQ    $0x01, BX
  3591  	SUBQ    $0x08, DX
  3592  	MOVBQZX (R12), CX
  3593  	ORQ     CX, AX
  3594  	JMP     sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
  3595  
        	// Input exhausted: more than 64 consumed bits is reported as an overread.
  3596  sequenceDecs_decodeSync_safe_bmi2_fill_check_overread:
  3597  	CMPQ DX, $0x40
  3598  	JA   error_overread
  3599  
  3600  sequenceDecs_decodeSync_safe_bmi2_fill_end:
  3601  	// Update offset
        	// R13 = bits 8-15 of the state word (BEXTR start 8, length 8) = number of
        	// extra bits. AX is rotated left by (consumed + R13) so those bits land
        	// at the low end, BZHI keeps exactly R13 of them, and the result is
        	// added to the baseline held in bits 32-63 of the state word.
  3602  	MOVQ   $0x00000808, CX
  3603  	BEXTRQ CX, R8, R13
  3604  	MOVQ   AX, R14
  3605  	LEAQ   (DX)(R13*1), CX
  3606  	ROLQ   CL, R14
  3607  	BZHIQ  R13, R14, R14
  3608  	MOVQ   CX, DX
  3609  	MOVQ   R8, CX
  3610  	SHRQ   $0x20, CX
  3611  	ADDQ   R14, CX
  3612  	MOVQ   CX, 8(SP)
  3613  
  3614  	// Update match length
        	// Same extraction as for the offset, using the match-length state in DI.
  3615  	MOVQ   $0x00000808, CX
  3616  	BEXTRQ CX, DI, R13
  3617  	MOVQ   AX, R14
  3618  	LEAQ   (DX)(R13*1), CX
  3619  	ROLQ   CL, R14
  3620  	BZHIQ  R13, R14, R14
  3621  	MOVQ   CX, DX
  3622  	MOVQ   DI, CX
  3623  	SHRQ   $0x20, CX
  3624  	ADDQ   R14, CX
  3625  	MOVQ   CX, 16(SP)
  3626  
  3627  	// Fill bitreader to have enough for the remaining
  3628  	CMPQ BX, $0x08
  3629  	JL   sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
  3630  	MOVQ DX, CX
  3631  	SHRQ $0x03, CX
  3632  	SUBQ CX, R12
  3633  	MOVQ (R12), AX
  3634  	SUBQ CX, BX
  3635  	ANDQ $0x07, DX
  3636  	JMP  sequenceDecs_decodeSync_safe_bmi2_fill_2_end
  3637  
  3638  sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte:
  3639  	CMPQ    BX, $0x00
  3640  	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_2_check_overread
  3641  	CMPQ    DX, $0x07
  3642  	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_2_end
  3643  	SHLQ    $0x08, AX
  3644  	SUBQ    $0x01, R12
  3645  	SUBQ    $0x01, BX
  3646  	SUBQ    $0x08, DX
  3647  	MOVBQZX (R12), CX
  3648  	ORQ     CX, AX
  3649  	JMP     sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
  3650  
  3651  sequenceDecs_decodeSync_safe_bmi2_fill_2_check_overread:
  3652  	CMPQ DX, $0x40
  3653  	JA   error_overread
  3654  
  3655  sequenceDecs_decodeSync_safe_bmi2_fill_2_end:
  3656  	// Update literal length
        	// Same extraction again, using the literal-length state in SI.
  3657  	MOVQ   $0x00000808, CX
  3658  	BEXTRQ CX, SI, R13
  3659  	MOVQ   AX, R14
  3660  	LEAQ   (DX)(R13*1), CX
  3661  	ROLQ   CL, R14
  3662  	BZHIQ  R13, R14, R14
  3663  	MOVQ   CX, DX
  3664  	MOVQ   SI, CX
  3665  	SHRQ   $0x20, CX
  3666  	ADDQ   R14, CX
  3667  	MOVQ   CX, 24(SP)
  3668  
  3669  	// Fill bitreader for state updates
        	// The input read pointer is saved back to 0(SP). R12 then receives the
        	// extra-bit count of the current offset state, which the offset
        	// adjustment below tests. The state update is skipped when 96(ctx) is 0.
  3670  	MOVQ    R12, (SP)
  3671  	MOVQ    $0x00000808, CX
  3672  	BEXTRQ  CX, R8, R12
  3673  	MOVQ    ctx+16(FP), CX
  3674  	CMPQ    96(CX), $0x00
  3675  	JZ      sequenceDecs_decodeSync_safe_bmi2_skip_update
        	// R13 = low byte of SI+DI+R8, the total number of bits consumed by the
        	// three state updates; R14 receives those bits in one extraction.
  3676  	LEAQ    (SI)(DI*1), R13
  3677  	ADDQ    R8, R13
  3678  	MOVBQZX R13, R13
  3679  	LEAQ    (DX)(R13*1), CX
  3680  	MOVQ    AX, R14
  3681  	MOVQ    CX, DX
  3682  	ROLQ    CL, R14
  3683  	BZHIQ   R13, R14, R14
  3684  
  3685  	// Update Offset State
        	// CX = this state's share of the bits (bit count taken from the low byte
        	// of R8); R14 is shifted down past them. The table index is bits 16-31
        	// of the old state word plus CX.
  3686  	BZHIQ R8, R14, CX
  3687  	SHRXQ R8, R14, R14
  3688  	SHRL  $0x10, R8
  3689  	ADDQ  CX, R8
  3690  
  3691  	// Load ctx.ofTable
  3692  	MOVQ ctx+16(FP), CX
  3693  	MOVQ 48(CX), CX
  3694  	MOVQ (CX)(R8*8), R8
  3695  
  3696  	// Update Match Length State
  3697  	BZHIQ DI, R14, CX
  3698  	SHRXQ DI, R14, R14
  3699  	SHRL  $0x10, DI
  3700  	ADDQ  CX, DI
  3701  
  3702  	// Load ctx.mlTable
  3703  	MOVQ ctx+16(FP), CX
  3704  	MOVQ 24(CX), CX
  3705  	MOVQ (CX)(DI*8), DI
  3706  
  3707  	// Update Literal Length State
        	// Last consumer of R14, so no shift of the remaining bits is needed.
  3708  	BZHIQ SI, R14, CX
  3709  	SHRL  $0x10, SI
  3710  	ADDQ  CX, SI
  3711  
  3712  	// Load ctx.llTable
  3713  	MOVQ ctx+16(FP), CX
  3714  	MOVQ (CX), CX
  3715  	MOVQ (CX)(SI*8), SI
  3716  
  3717  sequenceDecs_decodeSync_safe_bmi2_skip_update:
  3718  	// Adjust offset
        	// R13 = decoded offset value, R12 = its extra-bit count. With more than
        	// one extra bit the value is used as-is and pushed in front of the three
        	// 8-byte previous-offset slots at 144/152/160(s): the old first two
        	// slots move to 152/160(s) through X0.
  3719  	MOVQ   s+0(FP), CX
  3720  	MOVQ   8(SP), R13
  3721  	CMPQ   R12, $0x01
  3722  	JBE    sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0
  3723  	MOVUPS 144(CX), X0
  3724  	MOVQ   R13, 144(CX)
  3725  	MOVUPS X0, 152(CX)
  3726  	JMP    sequenceDecs_decodeSync_safe_bmi2_after_adjust
  3727  
        	// At most one extra bit: the value selects one of the previous offsets.
        	// A literal length of zero increments the selector by one.
  3728  sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0:
  3729  	CMPQ 24(SP), $0x00000000
  3730  	JNE  sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero
  3731  	INCQ R13
  3732  	JMP  sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
  3733  
        	// Selector 0 with a non-zero literal length: reuse 144(s) and leave the
        	// previous-offset slots untouched.
  3734  sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero:
  3735  	TESTQ R13, R13
  3736  	JNZ   sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
  3737  	MOVQ  144(CX), R13
  3738  	JMP   sequenceDecs_decodeSync_safe_bmi2_after_adjust
  3739  
        	// R14 = previous offset number R13, except that selector 3 yields
        	// 144(s) - 1 (index forced to 0, addend forced to -1 by the CMOVs).
        	// A zero result is replaced by 1.
  3740  sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero:
  3741  	MOVQ    R13, R12
  3742  	XORQ    R14, R14
  3743  	MOVQ    $-1, R15
  3744  	CMPQ    R13, $0x03
  3745  	CMOVQEQ R14, R12
  3746  	CMOVQEQ R15, R14
  3747  	ADDQ    144(CX)(R12*8), R14
  3748  	JNZ     sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid
  3749  	MOVQ    $0x00000001, R14
  3750  
        	// Unless the selector is 1, the second slot also moves down to the third.
  3751  sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid:
  3752  	CMPQ R13, $0x01
  3753  	JZ   sequenceDecs_decodeSync_safe_bmi2_adjust_skip
  3754  	MOVQ 152(CX), R12
  3755  	MOVQ R12, 160(CX)
  3756  
        	// The first slot moves to the second and R14 becomes the new first slot
        	// and the offset of this sequence.
  3757  sequenceDecs_decodeSync_safe_bmi2_adjust_skip:
  3758  	MOVQ 144(CX), R12
  3759  	MOVQ R12, 152(CX)
  3760  	MOVQ R14, 144(CX)
  3761  	MOVQ R14, R13
  3762  
  3763  sequenceDecs_decodeSync_safe_bmi2_after_adjust:
  3764  	MOVQ R13, 8(SP)
  3765  
  3766  	// Check values
        	// CX = match length, R12 = literal length, R13 = match offset.
        	// 256(s) accumulates ll+ml; 104(ctx) is reduced by ll and must not go
        	// negative; ml may not exceed 0x20002; a zero offset requires ml == 0.
  3767  	MOVQ  16(SP), CX
  3768  	MOVQ  24(SP), R12
  3769  	LEAQ  (CX)(R12*1), R14
  3770  	MOVQ  s+0(FP), R15
  3771  	ADDQ  R14, 256(R15)
  3772  	MOVQ  ctx+16(FP), R14
  3773  	SUBQ  R12, 104(R14)
  3774  	JS    error_not_enough_literals
  3775  	CMPQ  CX, $0x00020002
  3776  	JA    sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big
  3777  	TESTQ R13, R13
  3778  	JNZ   sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok
  3779  	TESTQ CX, CX
  3780  	JNZ   sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch
  3781  
        	// From here on: CX = literal length, R12 = match offset, R13 = match length.
  3782  sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok:
  3783  	MOVQ 24(SP), CX
  3784  	MOVQ 8(SP), R12
  3785  	MOVQ 16(SP), R13
  3786  
  3787  	// Check if we have enough space in s.out
  3788  	LEAQ (CX)(R13*1), R14
  3789  	ADDQ R9, R14
  3790  	CMPQ R14, 32(SP)
  3791  	JA   error_not_enough_space
  3792  
  3793  	// Copy literals
        	// Copy scheme shared by all copy_N blocks: lengths of 16 or more are
        	// moved in 16-byte steps followed by one final 16-byte move that ends
        	// exactly at the end of the range (it may overlap the previous step);
        	// shorter lengths use a pair of moves anchored at the first and last
        	// byte (or a word plus a byte for length 3).
  3794  	TESTQ CX, CX
  3795  	JZ    check_offset
  3796  	MOVQ  CX, R14
  3797  	SUBQ  $0x10, R14
  3798  	JB    copy_1_small
  3799  
  3800  copy_1_loop:
  3801  	MOVUPS (R10), X0
  3802  	MOVUPS X0, (R9)
  3803  	ADDQ   $0x10, R10
  3804  	ADDQ   $0x10, R9
  3805  	SUBQ   $0x10, R14
  3806  	JAE    copy_1_loop
        	// R14 is now negative: both pointers are moved to the exact end of the range.
  3807  	LEAQ   16(R10)(R14*1), R10
  3808  	LEAQ   16(R9)(R14*1), R9
  3809  	MOVUPS -16(R10), X0
  3810  	MOVUPS X0, -16(R9)
  3811  	JMP    copy_1_end
  3812  
  3813  copy_1_small:
  3814  	CMPQ CX, $0x03
  3815  	JE   copy_1_move_3
  3816  	JB   copy_1_move_1or2
  3817  	CMPQ CX, $0x08
  3818  	JB   copy_1_move_4through7
  3819  	JMP  copy_1_move_8through16
  3820  
  3821  copy_1_move_1or2:
  3822  	MOVB (R10), R14
  3823  	MOVB -1(R10)(CX*1), R15
  3824  	MOVB R14, (R9)
  3825  	MOVB R15, -1(R9)(CX*1)
  3826  	ADDQ CX, R10
  3827  	ADDQ CX, R9
  3828  	JMP  copy_1_end
  3829  
  3830  copy_1_move_3:
  3831  	MOVW (R10), R14
  3832  	MOVB 2(R10), R15
  3833  	MOVW R14, (R9)
  3834  	MOVB R15, 2(R9)
  3835  	ADDQ CX, R10
  3836  	ADDQ CX, R9
  3837  	JMP  copy_1_end
  3838  
  3839  copy_1_move_4through7:
  3840  	MOVL (R10), R14
  3841  	MOVL -4(R10)(CX*1), R15
  3842  	MOVL R14, (R9)
  3843  	MOVL R15, -4(R9)(CX*1)
  3844  	ADDQ CX, R10
  3845  	ADDQ CX, R9
  3846  	JMP  copy_1_end
  3847  
  3848  copy_1_move_8through16:
  3849  	MOVQ (R10), R14
  3850  	MOVQ -8(R10)(CX*1), R15
  3851  	MOVQ R14, (R9)
  3852  	MOVQ R15, -8(R9)(CX*1)
  3853  	ADDQ CX, R10
  3854  	ADDQ CX, R9
  3855  
        	// Account for the copied literals in the output position.
  3856  copy_1_end:
  3857  	ADDQ CX, R11
  3858  
  3859  	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
  3860  check_offset:
  3861  	MOVQ R11, CX
  3862  	ADDQ 40(SP), CX
  3863  	CMPQ R12, CX
  3864  	JG   error_match_off_too_big
  3865  	CMPQ R12, 56(SP)
  3866  	JG   error_match_off_too_big
  3867  
  3868  	// Copy match from history
        	// CX = offset - output position. If that is zero or negative the match
        	// lies entirely in the current output. Otherwise CX bytes of it are in
        	// the history and R14 points at them (history end - CX). A match longer
        	// than CX continues in the current output (copy_all_from_history).
  3869  	MOVQ R12, CX
  3870  	SUBQ R11, CX
  3871  	JLS  copy_match
  3872  	MOVQ 48(SP), R14
  3873  	SUBQ CX, R14
  3874  	CMPQ R13, CX
  3875  	JG   copy_all_from_history
  3876  	MOVQ R13, CX
  3877  	SUBQ $0x10, CX
  3878  	JB   copy_4_small
  3879  
  3880  copy_4_loop:
  3881  	MOVUPS (R14), X0
  3882  	MOVUPS X0, (R9)
  3883  	ADDQ   $0x10, R14
  3884  	ADDQ   $0x10, R9
  3885  	SUBQ   $0x10, CX
  3886  	JAE    copy_4_loop
  3887  	LEAQ   16(R14)(CX*1), R14
  3888  	LEAQ   16(R9)(CX*1), R9
  3889  	MOVUPS -16(R14), X0
  3890  	MOVUPS X0, -16(R9)
  3891  	JMP    copy_4_end
  3892  
        	// NOTE(review): unlike copy_1/copy_2/copy_5 there is no 1-or-2-byte case
        	// here; presumably the match length is assumed to be at least 3 on this
        	// path - confirm against the generator.
  3893  copy_4_small:
  3894  	CMPQ R13, $0x03
  3895  	JE   copy_4_move_3
  3896  	CMPQ R13, $0x08
  3897  	JB   copy_4_move_4through7
  3898  	JMP  copy_4_move_8through16
  3899  
  3900  copy_4_move_3:
  3901  	MOVW (R14), CX
  3902  	MOVB 2(R14), R12
  3903  	MOVW CX, (R9)
  3904  	MOVB R12, 2(R9)
  3905  	ADDQ R13, R14
  3906  	ADDQ R13, R9
  3907  	JMP  copy_4_end
  3908  
  3909  copy_4_move_4through7:
  3910  	MOVL (R14), CX
  3911  	MOVL -4(R14)(R13*1), R12
  3912  	MOVL CX, (R9)
  3913  	MOVL R12, -4(R9)(R13*1)
  3914  	ADDQ R13, R14
  3915  	ADDQ R13, R9
  3916  	JMP  copy_4_end
  3917  
  3918  copy_4_move_8through16:
  3919  	MOVQ (R14), CX
  3920  	MOVQ -8(R14)(R13*1), R12
  3921  	MOVQ CX, (R9)
  3922  	MOVQ R12, -8(R9)(R13*1)
  3923  	ADDQ R13, R14
  3924  	ADDQ R13, R9
  3925  
        	// The whole match came from the history; advance the output position.
  3926  copy_4_end:
  3927  	ADDQ R13, R11
  3928  	JMP  handle_loop
        	// NOTE(review): unreachable, it directly follows an unconditional JMP.
  3929  	JMP loop_finished
  3930  
        	// The match starts in the history and continues in the current output:
        	// copy the CX history bytes here, then fall through to copy_match with
        	// the remaining length. BP is used as a plain scratch register in the
        	// short-copy cases below.
  3931  copy_all_from_history:
  3932  	MOVQ CX, R15
  3933  	SUBQ $0x10, R15
  3934  	JB   copy_5_small
  3935  
  3936  copy_5_loop:
  3937  	MOVUPS (R14), X0
  3938  	MOVUPS X0, (R9)
  3939  	ADDQ   $0x10, R14
  3940  	ADDQ   $0x10, R9
  3941  	SUBQ   $0x10, R15
  3942  	JAE    copy_5_loop
  3943  	LEAQ   16(R14)(R15*1), R14
  3944  	LEAQ   16(R9)(R15*1), R9
  3945  	MOVUPS -16(R14), X0
  3946  	MOVUPS X0, -16(R9)
  3947  	JMP    copy_5_end
  3948  
  3949  copy_5_small:
  3950  	CMPQ CX, $0x03
  3951  	JE   copy_5_move_3
  3952  	JB   copy_5_move_1or2
  3953  	CMPQ CX, $0x08
  3954  	JB   copy_5_move_4through7
  3955  	JMP  copy_5_move_8through16
  3956  
  3957  copy_5_move_1or2:
  3958  	MOVB (R14), R15
  3959  	MOVB -1(R14)(CX*1), BP
  3960  	MOVB R15, (R9)
  3961  	MOVB BP, -1(R9)(CX*1)
  3962  	ADDQ CX, R14
  3963  	ADDQ CX, R9
  3964  	JMP  copy_5_end
  3965  
  3966  copy_5_move_3:
  3967  	MOVW (R14), R15
  3968  	MOVB 2(R14), BP
  3969  	MOVW R15, (R9)
  3970  	MOVB BP, 2(R9)
  3971  	ADDQ CX, R14
  3972  	ADDQ CX, R9
  3973  	JMP  copy_5_end
  3974  
  3975  copy_5_move_4through7:
  3976  	MOVL (R14), R15
  3977  	MOVL -4(R14)(CX*1), BP
  3978  	MOVL R15, (R9)
  3979  	MOVL BP, -4(R9)(CX*1)
  3980  	ADDQ CX, R14
  3981  	ADDQ CX, R9
  3982  	JMP  copy_5_end
  3983  
  3984  copy_5_move_8through16:
  3985  	MOVQ (R14), R15
  3986  	MOVQ -8(R14)(CX*1), BP
  3987  	MOVQ R15, (R9)
  3988  	MOVQ BP, -8(R9)(CX*1)
  3989  	ADDQ CX, R14
  3990  	ADDQ CX, R9
  3991  
        	// CX history bytes are done: advance the output position and reduce the
        	// remaining match length accordingly.
  3992  copy_5_end:
  3993  	ADDQ CX, R11
  3994  	SUBQ CX, R13
  3995  
  3996  	// Copy match from the current buffer
        	// CX = source pointer = output write pointer - match offset.
  3997  copy_match:
  3998  	MOVQ R9, CX
  3999  	SUBQ R12, CX
  4000  
  4001  	// ml <= mo
  4002  	CMPQ R13, R12
  4003  	JA   copy_overlapping_match
  4004  
  4005  	// Copy non-overlapping match
  4006  	ADDQ R13, R11
  4007  	MOVQ R13, R12
  4008  	SUBQ $0x10, R12
  4009  	JB   copy_2_small
  4010  
  4011  copy_2_loop:
  4012  	MOVUPS (CX), X0
  4013  	MOVUPS X0, (R9)
  4014  	ADDQ   $0x10, CX
  4015  	ADDQ   $0x10, R9
  4016  	SUBQ   $0x10, R12
  4017  	JAE    copy_2_loop
  4018  	LEAQ   16(CX)(R12*1), CX
  4019  	LEAQ   16(R9)(R12*1), R9
  4020  	MOVUPS -16(CX), X0
  4021  	MOVUPS X0, -16(R9)
  4022  	JMP    copy_2_end
  4023  
  4024  copy_2_small:
  4025  	CMPQ R13, $0x03
  4026  	JE   copy_2_move_3
  4027  	JB   copy_2_move_1or2
  4028  	CMPQ R13, $0x08
  4029  	JB   copy_2_move_4through7
  4030  	JMP  copy_2_move_8through16
  4031  
  4032  copy_2_move_1or2:
  4033  	MOVB (CX), R12
  4034  	MOVB -1(CX)(R13*1), R14
  4035  	MOVB R12, (R9)
  4036  	MOVB R14, -1(R9)(R13*1)
  4037  	ADDQ R13, CX
  4038  	ADDQ R13, R9
  4039  	JMP  copy_2_end
  4040  
  4041  copy_2_move_3:
  4042  	MOVW (CX), R12
  4043  	MOVB 2(CX), R14
  4044  	MOVW R12, (R9)
  4045  	MOVB R14, 2(R9)
  4046  	ADDQ R13, CX
  4047  	ADDQ R13, R9
  4048  	JMP  copy_2_end
  4049  
  4050  copy_2_move_4through7:
  4051  	MOVL (CX), R12
  4052  	MOVL -4(CX)(R13*1), R14
  4053  	MOVL R12, (R9)
  4054  	MOVL R14, -4(R9)(R13*1)
  4055  	ADDQ R13, CX
  4056  	ADDQ R13, R9
  4057  	JMP  copy_2_end
  4058  
  4059  copy_2_move_8through16:
  4060  	MOVQ (CX), R12
  4061  	MOVQ -8(CX)(R13*1), R14
  4062  	MOVQ R12, (R9)
  4063  	MOVQ R14, -8(R9)(R13*1)
  4064  	ADDQ R13, CX
  4065  	ADDQ R13, R9
  4066  
  4067  copy_2_end:
  4068  	JMP handle_loop
  4069  
  4070  	// Copy overlapping match
        	// The match is longer than its offset, so the source range runs into
        	// bytes written by this same copy; it is done one byte at a time, front
        	// to back. R13 is non-zero here because it compared above R12.
  4071  copy_overlapping_match:
  4072  	ADDQ R13, R11
  4073  
  4074  copy_slow_3:
  4075  	MOVB (CX), R12
  4076  	MOVB R12, (R9)
  4077  	INCQ CX
  4078  	INCQ R9
  4079  	DECQ R13
  4080  	JNZ  copy_slow_3
  4081  
        	// Decrement the counter at 96(ctx) and continue while it is not negative.
  4082  handle_loop:
  4083  	MOVQ ctx+16(FP), CX
  4084  	DECQ 96(CX)
  4085  	JNS  sequenceDecs_decodeSync_safe_bmi2_main_loop
  4086  
        	// Write the bit-reader state (value, bits read, remaining byte count)
        	// back to br.
  4087  loop_finished:
  4088  	MOVQ br+8(FP), CX
  4089  	MOVQ AX, 24(CX)
  4090  	MOVB DL, 32(CX)
  4091  	MOVQ BX, 8(CX)
  4092  
  4093  	// Update the context
        	// 136(ctx) = output position; 168(ctx) = distance the literals read
        	// pointer advanced from its start value at 144(ctx).
  4094  	MOVQ ctx+16(FP), AX
  4095  	MOVQ R11, 136(AX)
  4096  	MOVQ 144(AX), CX
  4097  	SUBQ CX, R10
  4098  	MOVQ R10, 168(AX)
  4099  
  4100  	// Return success
  4101  	MOVQ $0x00000000, ret+24(FP)
  4102  	RET
  4103  
  4104  	// Return with match length error
        	// Stores the offending match length at 216(ctx).
  4105  sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch:
  4106  	MOVQ 16(SP), AX
  4107  	MOVQ ctx+16(FP), CX
  4108  	MOVQ AX, 216(CX)
  4109  	MOVQ $0x00000001, ret+24(FP)
  4110  	RET
  4111  
  4112  	// Return with match too long error
        	// Stores the offending match length at 216(ctx).
  4113  sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big:
  4114  	MOVQ ctx+16(FP), AX
  4115  	MOVQ 16(SP), CX
  4116  	MOVQ CX, 216(AX)
  4117  	MOVQ $0x00000002, ret+24(FP)
  4118  	RET
  4119  
  4120  	// Return with match offset too long error
        	// Stores the offending offset at 224(ctx) and the output position at 136(ctx).
  4121  error_match_off_too_big:
  4122  	MOVQ ctx+16(FP), AX
  4123  	MOVQ 8(SP), CX
  4124  	MOVQ CX, 224(AX)
  4125  	MOVQ R11, 136(AX)
  4126  	MOVQ $0x00000003, ret+24(FP)
  4127  	RET
  4128  
  4129  	// Return with not enough literals error
        	// Stores the requested literal length at 208(ctx).
  4130  error_not_enough_literals:
  4131  	MOVQ ctx+16(FP), AX
  4132  	MOVQ 24(SP), CX
  4133  	MOVQ CX, 208(AX)
  4134  	MOVQ $0x00000004, ret+24(FP)
  4135  	RET
  4136  
  4137  	// Return with overread error
  4138  error_overread:
  4139  	MOVQ $0x00000006, ret+24(FP)
  4140  	RET
  4141  
  4142  	// Return with not enough output space error
        	// Stores literal length (208), match length (216) and output position
        	// (136) in ctx.
  4143  error_not_enough_space:
  4144  	MOVQ ctx+16(FP), AX
  4145  	MOVQ 24(SP), CX
  4146  	MOVQ CX, 208(AX)
  4147  	MOVQ 16(SP), CX
  4148  	MOVQ CX, 216(AX)
  4149  	MOVQ R11, 136(AX)
  4150  	MOVQ $0x00000005, ret+24(FP)
  4151  	RET