github.com/bir3/gocompiler@v0.9.2202/extra/compress/zstd/seqdec_amd64.s (about)

     1  // Code generated by command: go run gen.go -out ../seqdec_amd64.s -pkg=zstd. DO NOT EDIT.
     2  
     3  //go:build !appengine && !noasm && gc && !noasm
     4  
     5  // func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
     6  // Requires: CMOV
        //
        // Decodes sequences in a loop. Each iteration reads an offset, a match length and
        // a literal length from the bit buffer and stores them as three quadwords at the
        // output pointer loaded from ctx+104 (literal length at +0, match length at +8,
        // offset at +16), then advances that pointer by 24 bytes. The loop repeats until
        // the counter at ctx+96 is decremented below zero.
        //
        // Value written to ret+24(FP):
        //   0  success; the three recent offsets are stored back to s and the reader
        //      state (bit buffer, consumed-bit count, remaining bytes) back to br
        //   1  non-zero match length together with a zero offset
        //   2  match length above 0x20002
        //   4  the quadword at ctx+128 went negative after subtracting the literal length
        //   6  bit reader over-read
        // The error returns do not write the register state back to s or br; updates
        // already made in memory (s+256, ctx+128, ctx+96, the output records) remain.
        //
        // Register roles inside the loop:
        //   DX   bit buffer
        //   BX   number of bits already consumed from DX
        //   SI   input bytes not yet loaded (input is consumed from high to low addresses)
        //   R14  input pointer while filling; saved in (SP) between iterations
        //   DI, R8, R9     literal-length, match-length and offset state entries
        //   R10  output pointer
        //   R11, R12, R13  the three recent offsets, R11 being the newest
        //   AX, CX, R15, BP  scratch
     7  TEXT ·sequenceDecs_decode_amd64(SB), $8-32
        	// Load the bit reader state from br.
     8  	MOVQ    br+8(FP), AX
     9  	MOVQ    32(AX), DX
    10  	MOVBQZX 40(AX), BX
    11  	MOVQ    24(AX), SI
        	// (SP) = pointer stored at br+0 plus the remaining byte count.
    12  	MOVQ    (AX), AX
    13  	ADDQ    SI, AX
    14  	MOVQ    AX, (SP)
        	// Load the three state entries and the output pointer from ctx.
    15  	MOVQ    ctx+16(FP), AX
    16  	MOVQ    72(AX), DI
    17  	MOVQ    80(AX), R8
    18  	MOVQ    88(AX), R9
    19  	MOVQ    104(AX), R10
        	// Load the three recent offsets from s.
    20  	MOVQ    s+0(FP), AX
    21  	MOVQ    144(AX), R11
    22  	MOVQ    152(AX), R12
    23  	MOVQ    160(AX), R13
    24  
    25  sequenceDecs_decode_amd64_main_loop:
    26  	MOVQ (SP), R14
    27  
    28  	// Fill bitreader to have enough for the offset and match length.
        	// Fast path (SI >= 8, signed compare): move the pointer back by the number of
        	// whole bytes already consumed (BX>>3), reload all 64 bits of the buffer and
        	// keep only the sub-byte bit position in BX.
    29  	CMPQ SI, $0x08
    30  	JL   sequenceDecs_decode_amd64_fill_byte_by_byte
    31  	MOVQ BX, AX
    32  	SHRQ $0x03, AX
    33  	SUBQ AX, R14
    34  	MOVQ (R14), DX
    35  	SUBQ AX, SI
    36  	ANDQ $0x07, BX
    37  	JMP  sequenceDecs_decode_amd64_fill_end
    38  
        // Slow path: while bytes remain (SI > 0) and at least 8 bits have been consumed
        // (BX > 7), shift the buffer left one byte and OR in the next lower input byte.
    39  sequenceDecs_decode_amd64_fill_byte_by_byte:
    40  	CMPQ    SI, $0x00
    41  	JLE     sequenceDecs_decode_amd64_fill_check_overread
    42  	CMPQ    BX, $0x07
    43  	JLE     sequenceDecs_decode_amd64_fill_end
    44  	SHLQ    $0x08, DX
    45  	SUBQ    $0x01, R14
    46  	SUBQ    $0x01, SI
    47  	SUBQ    $0x08, BX
    48  	MOVBQZX (R14), AX
    49  	ORQ     AX, DX
    50  	JMP     sequenceDecs_decode_amd64_fill_byte_by_byte
    51  
        // No input bytes left: more than 64 consumed bits (unsigned compare) is an over-read.
    52  sequenceDecs_decode_amd64_fill_check_overread:
    53  	CMPQ BX, $0x40
    54  	JA   error_overread
    55  
    56  sequenceDecs_decode_amd64_fill_end:
    57  	// Update offset
        	// AX = offset state entry. R15 = bit buffer shifted left by the consumed bit
        	// count, so the next unread bit is at bit 63. CL = bits 8..15 of the entry
        	// (number of bits to read); AX becomes the upper 32 bits of the entry (base).
        	// The read is skipped when the bit count is zero, when the new consumed total
        	// exceeds 64 (BX has already been increased in that case), or when the bit
        	// count is 64 or more. Otherwise the count is negated so the right shift
        	// (count taken modulo 64) leaves the top CL bits of R15, which are added to AX.
    58  	MOVQ  R9, AX
    59  	MOVQ  BX, CX
    60  	MOVQ  DX, R15
    61  	SHLQ  CL, R15
    62  	MOVB  AH, CL
    63  	SHRQ  $0x20, AX
    64  	TESTQ CX, CX
    65  	JZ    sequenceDecs_decode_amd64_of_update_zero
    66  	ADDQ  CX, BX
    67  	CMPQ  BX, $0x40
    68  	JA    sequenceDecs_decode_amd64_of_update_zero
    69  	CMPQ  CX, $0x40
    70  	JAE   sequenceDecs_decode_amd64_of_update_zero
    71  	NEGQ  CX
    72  	SHRQ  CL, R15
    73  	ADDQ  R15, AX
    74  
        // Store the decoded offset value in the output record (+16).
    75  sequenceDecs_decode_amd64_of_update_zero:
    76  	MOVQ AX, 16(R10)
    77  
    78  	// Update match length
        	// Same read as for the offset, using the match-length state entry in R8.
    79  	MOVQ  R8, AX
    80  	MOVQ  BX, CX
    81  	MOVQ  DX, R15
    82  	SHLQ  CL, R15
    83  	MOVB  AH, CL
    84  	SHRQ  $0x20, AX
    85  	TESTQ CX, CX
    86  	JZ    sequenceDecs_decode_amd64_ml_update_zero
    87  	ADDQ  CX, BX
    88  	CMPQ  BX, $0x40
    89  	JA    sequenceDecs_decode_amd64_ml_update_zero
    90  	CMPQ  CX, $0x40
    91  	JAE   sequenceDecs_decode_amd64_ml_update_zero
    92  	NEGQ  CX
    93  	SHRQ  CL, R15
    94  	ADDQ  R15, AX
    95  
        // Store the decoded match length in the output record (+8).
    96  sequenceDecs_decode_amd64_ml_update_zero:
    97  	MOVQ AX, 8(R10)
    98  
    99  	// Fill bitreader to have enough for the remaining
        	// Same refill procedure as at the top of the loop.
   100  	CMPQ SI, $0x08
   101  	JL   sequenceDecs_decode_amd64_fill_2_byte_by_byte
   102  	MOVQ BX, AX
   103  	SHRQ $0x03, AX
   104  	SUBQ AX, R14
   105  	MOVQ (R14), DX
   106  	SUBQ AX, SI
   107  	ANDQ $0x07, BX
   108  	JMP  sequenceDecs_decode_amd64_fill_2_end
   109  
   110  sequenceDecs_decode_amd64_fill_2_byte_by_byte:
   111  	CMPQ    SI, $0x00
   112  	JLE     sequenceDecs_decode_amd64_fill_2_check_overread
   113  	CMPQ    BX, $0x07
   114  	JLE     sequenceDecs_decode_amd64_fill_2_end
   115  	SHLQ    $0x08, DX
   116  	SUBQ    $0x01, R14
   117  	SUBQ    $0x01, SI
   118  	SUBQ    $0x08, BX
   119  	MOVBQZX (R14), AX
   120  	ORQ     AX, DX
   121  	JMP     sequenceDecs_decode_amd64_fill_2_byte_by_byte
   122  
   123  sequenceDecs_decode_amd64_fill_2_check_overread:
   124  	CMPQ BX, $0x40
   125  	JA   error_overread
   126  
   127  sequenceDecs_decode_amd64_fill_2_end:
   128  	// Update literal length
        	// Same read as for the offset, using the literal-length state entry in DI.
   129  	MOVQ  DI, AX
   130  	MOVQ  BX, CX
   131  	MOVQ  DX, R15
   132  	SHLQ  CL, R15
   133  	MOVB  AH, CL
   134  	SHRQ  $0x20, AX
   135  	TESTQ CX, CX
   136  	JZ    sequenceDecs_decode_amd64_ll_update_zero
   137  	ADDQ  CX, BX
   138  	CMPQ  BX, $0x40
   139  	JA    sequenceDecs_decode_amd64_ll_update_zero
   140  	CMPQ  CX, $0x40
   141  	JAE   sequenceDecs_decode_amd64_ll_update_zero
   142  	NEGQ  CX
   143  	SHRQ  CL, R15
   144  	ADDQ  R15, AX
   145  
        // Store the decoded literal length in the output record (+0).
   146  sequenceDecs_decode_amd64_ll_update_zero:
   147  	MOVQ AX, (R10)
   148  
   149  	// Fill bitreader for state updates
        	// Save the input pointer for the next iteration. AX = bits 8..15 of the offset
        	// state entry (its bit count), captured before R9 is replaced; it drives the
        	// offset adjustment below. The state updates are skipped when the counter at
        	// ctx+96 is zero.
   150  	MOVQ    R14, (SP)
   151  	MOVQ    R9, AX
   152  	SHRQ    $0x08, AX
   153  	MOVBQZX AL, AX
   154  	MOVQ    ctx+16(FP), CX
   155  	CMPQ    96(CX), $0x00
   156  	JZ      sequenceDecs_decode_amd64_skip_update
   157  
   158  	// Update Literal Length State
        	// R14 = low byte of the entry: number of bits to read for the next state.
        	// DI = bits 16..31 of the entry. The buffer is rotated left by the new
        	// consumed-bit total so the bits just consumed sit at the bottom, masked to
        	// R14 bits with (1<<R14)-1 (built with 32-bit operations in BP) and added to
        	// DI to form the table index.
   159  	MOVBQZX DI, R14
   160  	SHRQ    $0x10, DI
   161  	MOVWQZX DI, DI
   162  	LEAQ    (BX)(R14*1), CX
   163  	MOVQ    DX, R15
   164  	MOVQ    CX, BX
   165  	ROLQ    CL, R15
   166  	MOVL    $0x00000001, BP
   167  	MOVB    R14, CL
   168  	SHLL    CL, BP
   169  	DECL    BP
   170  	ANDQ    BP, R15
   171  	ADDQ    R15, DI
   172  
   173  	// Load ctx.llTable
        	// The table pointer is the quadword at ctx+0; entries are 8 bytes each.
   174  	MOVQ ctx+16(FP), CX
   175  	MOVQ (CX), CX
   176  	MOVQ (CX)(DI*8), DI
   177  
   178  	// Update Match Length State
        	// Same computation as for the literal-length state, on R8.
   179  	MOVBQZX R8, R14
   180  	SHRQ    $0x10, R8
   181  	MOVWQZX R8, R8
   182  	LEAQ    (BX)(R14*1), CX
   183  	MOVQ    DX, R15
   184  	MOVQ    CX, BX
   185  	ROLQ    CL, R15
   186  	MOVL    $0x00000001, BP
   187  	MOVB    R14, CL
   188  	SHLL    CL, BP
   189  	DECL    BP
   190  	ANDQ    BP, R15
   191  	ADDQ    R15, R8
   192  
   193  	// Load ctx.mlTable
        	// The table pointer is the quadword at ctx+24.
   194  	MOVQ ctx+16(FP), CX
   195  	MOVQ 24(CX), CX
   196  	MOVQ (CX)(R8*8), R8
   197  
   198  	// Update Offset State
        	// Same computation as for the literal-length state, on R9.
   199  	MOVBQZX R9, R14
   200  	SHRQ    $0x10, R9
   201  	MOVWQZX R9, R9
   202  	LEAQ    (BX)(R14*1), CX
   203  	MOVQ    DX, R15
   204  	MOVQ    CX, BX
   205  	ROLQ    CL, R15
   206  	MOVL    $0x00000001, BP
   207  	MOVB    R14, CL
   208  	SHLL    CL, BP
   209  	DECL    BP
   210  	ANDQ    BP, R15
   211  	ADDQ    R15, R9
   212  
   213  	// Load ctx.ofTable
        	// The table pointer is the quadword at ctx+48.
   214  	MOVQ ctx+16(FP), CX
   215  	MOVQ 48(CX), CX
   216  	MOVQ (CX)(R9*8), R9
   217  
   218  sequenceDecs_decode_amd64_skip_update:
   219  	// Adjust offset
        	// CX = offset value just stored. When the offset's bit count (AX) is above 1
        	// the value is used as is: it becomes the newest recent offset and the two
        	// older ones shift down.
   220  	MOVQ 16(R10), CX
   221  	CMPQ AX, $0x01
   222  	JBE  sequenceDecs_decode_amd64_adjust_offsetB_1_or_0
   223  	MOVQ R12, R13
   224  	MOVQ R11, R12
   225  	MOVQ CX, R11
   226  	JMP  sequenceDecs_decode_amd64_after_adjust
   227  
        // Bit count is 0 or 1: a zero literal length increments the decoded value
        // before the selection below.
   228  sequenceDecs_decode_amd64_adjust_offsetB_1_or_0:
   229  	CMPQ (R10), $0x00000000
   230  	JNE  sequenceDecs_decode_amd64_adjust_offset_maybezero
   231  	INCQ CX
   232  	JMP  sequenceDecs_decode_amd64_adjust_offset_nonzero
   233  
        // Non-zero literal length: a decoded value of zero reuses the newest recent
        // offset (R11) and leaves the offset history unchanged.
   234  sequenceDecs_decode_amd64_adjust_offset_maybezero:
   235  	TESTQ CX, CX
   236  	JNZ   sequenceDecs_decode_amd64_adjust_offset_nonzero
   237  	MOVQ  R11, CX
   238  	JMP   sequenceDecs_decode_amd64_after_adjust
   239  
        // Select the candidate offset into AX according to CX:
        // CX < 1 -> R11, CX == 1 -> R12, CX == 2 -> R13, CX > 2 -> R11 - 1.
   240  sequenceDecs_decode_amd64_adjust_offset_nonzero:
   241  	CMPQ CX, $0x01
   242  	JB   sequenceDecs_decode_amd64_adjust_zero
   243  	JEQ  sequenceDecs_decode_amd64_adjust_one
   244  	CMPQ CX, $0x02
   245  	JA   sequenceDecs_decode_amd64_adjust_three
   246  	JMP  sequenceDecs_decode_amd64_adjust_two
   247  
   248  sequenceDecs_decode_amd64_adjust_zero:
   249  	MOVQ R11, AX
   250  	JMP  sequenceDecs_decode_amd64_adjust_test_temp_valid
   251  
   252  sequenceDecs_decode_amd64_adjust_one:
   253  	MOVQ R12, AX
   254  	JMP  sequenceDecs_decode_amd64_adjust_test_temp_valid
   255  
   256  sequenceDecs_decode_amd64_adjust_two:
   257  	MOVQ R13, AX
   258  	JMP  sequenceDecs_decode_amd64_adjust_test_temp_valid
   259  
   260  sequenceDecs_decode_amd64_adjust_three:
   261  	LEAQ -1(R11), AX
   262  
        // A candidate of zero is replaced by 1.
   263  sequenceDecs_decode_amd64_adjust_test_temp_valid:
   264  	TESTQ AX, AX
   265  	JNZ   sequenceDecs_decode_amd64_adjust_temp_valid
   266  	MOVQ  $0x00000001, AX
   267  
        // Rotate the history: unless CX == 1, R13 takes the old R12; then R12 takes the
        // old R11 and the candidate becomes both the newest offset and the result in CX.
   268  sequenceDecs_decode_amd64_adjust_temp_valid:
   269  	CMPQ    CX, $0x01
   270  	CMOVQNE R12, R13
   271  	MOVQ    R11, R12
   272  	MOVQ    AX, R11
   273  	MOVQ    AX, CX
   274  
        // Store the final offset in the output record (+16).
   275  sequenceDecs_decode_amd64_after_adjust:
   276  	MOVQ CX, 16(R10)
   277  
   278  	// Check values
        	// AX = match length, R14 = literal length. Their sum is added to the quadword
        	// at s+256 and the literal length is subtracted from the quadword at ctx+128;
        	// a negative result is the not-enough-literals error. A match length above
        	// 0x20002 (unsigned compare) is an error, as is a zero offset combined with a
        	// non-zero match length.
   279  	MOVQ  8(R10), AX
   280  	MOVQ  (R10), R14
   281  	LEAQ  (AX)(R14*1), R15
   282  	MOVQ  s+0(FP), BP
   283  	ADDQ  R15, 256(BP)
   284  	MOVQ  ctx+16(FP), R15
   285  	SUBQ  R14, 128(R15)
   286  	JS    error_not_enough_literals
   287  	CMPQ  AX, $0x00020002
   288  	JA    sequenceDecs_decode_amd64_error_match_len_too_big
   289  	TESTQ CX, CX
   290  	JNZ   sequenceDecs_decode_amd64_match_len_ofs_ok
   291  	TESTQ AX, AX
   292  	JNZ   sequenceDecs_decode_amd64_error_match_len_ofs_mismatch
   293  
        // Advance to the next 24-byte output record and decrement the counter at ctx+96;
        // loop while it stays non-negative. On exit write the recent offsets back to s
        // and the bit buffer, consumed-bit count (low byte of BX) and remaining byte
        // count back to br.
   294  sequenceDecs_decode_amd64_match_len_ofs_ok:
   295  	ADDQ $0x18, R10
   296  	MOVQ ctx+16(FP), AX
   297  	DECQ 96(AX)
   298  	JNS  sequenceDecs_decode_amd64_main_loop
   299  	MOVQ s+0(FP), AX
   300  	MOVQ R11, 144(AX)
   301  	MOVQ R12, 152(AX)
   302  	MOVQ R13, 160(AX)
   303  	MOVQ br+8(FP), AX
   304  	MOVQ DX, 32(AX)
   305  	MOVB BL, 40(AX)
   306  	MOVQ SI, 24(AX)
   307  
   308  	// Return success
   309  	MOVQ $0x00000000, ret+24(FP)
   310  	RET
   311  
   312  	// Return with match length error
   313  sequenceDecs_decode_amd64_error_match_len_ofs_mismatch:
   314  	MOVQ $0x00000001, ret+24(FP)
   315  	RET
   316  
   317  	// Return with match too long error
   318  sequenceDecs_decode_amd64_error_match_len_too_big:
   319  	MOVQ $0x00000002, ret+24(FP)
   320  	RET
   321  
   322  	// Return with match offset too long error
        	// No label precedes these two instructions and nothing in this function
        	// branches to them.
   323  	MOVQ $0x00000003, ret+24(FP)
   324  	RET
   325  
   326  	// Return with not enough literals error
   327  error_not_enough_literals:
   328  	MOVQ $0x00000004, ret+24(FP)
   329  	RET
   330  
   331  	// Return with overread error
   332  error_overread:
   333  	MOVQ $0x00000006, ret+24(FP)
   334  	RET
   335  
   336  // func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
   337  // Requires: CMOV
        //
        // Decodes sequences in a loop, filling the bit buffer once per iteration. Each
        // iteration reads an offset, a match length and a literal length from the bit
        // buffer and stores them as three quadwords at the output pointer loaded from
        // ctx+104 (literal length at +0, match length at +8, offset at +16), then
        // advances that pointer by 24 bytes. The loop repeats until the counter at
        // ctx+96 is decremented below zero.
        // NOTE(review): the _56 suffix presumably means the caller guarantees that one
        // fill provides enough bits for all three values; confirm against the Go caller.
        //
        // Value written to ret+24(FP):
        //   0  success; the three recent offsets are stored back to s and the reader
        //      state (bit buffer, consumed-bit count, remaining bytes) back to br
        //   1  non-zero match length together with a zero offset
        //   2  match length above 0x20002
        //   4  the quadword at ctx+128 went negative after subtracting the literal length
        //   6  bit reader over-read
        // The error returns do not write the register state back to s or br; updates
        // already made in memory (s+256, ctx+128, ctx+96, the output records) remain.
        //
        // Register roles inside the loop:
        //   DX   bit buffer
        //   BX   number of bits already consumed from DX
        //   SI   input bytes not yet loaded (input is consumed from high to low addresses)
        //   R14  input pointer while filling; saved in (SP) between iterations
        //   DI, R8, R9     literal-length, match-length and offset state entries
        //   R10  output pointer
        //   R11, R12, R13  the three recent offsets, R11 being the newest
        //   AX, CX, R15, BP  scratch
   338  TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32
        	// Load the bit reader state from br.
   339  	MOVQ    br+8(FP), AX
   340  	MOVQ    32(AX), DX
   341  	MOVBQZX 40(AX), BX
   342  	MOVQ    24(AX), SI
        	// (SP) = pointer stored at br+0 plus the remaining byte count.
   343  	MOVQ    (AX), AX
   344  	ADDQ    SI, AX
   345  	MOVQ    AX, (SP)
        	// Load the three state entries and the output pointer from ctx.
   346  	MOVQ    ctx+16(FP), AX
   347  	MOVQ    72(AX), DI
   348  	MOVQ    80(AX), R8
   349  	MOVQ    88(AX), R9
   350  	MOVQ    104(AX), R10
        	// Load the three recent offsets from s.
   351  	MOVQ    s+0(FP), AX
   352  	MOVQ    144(AX), R11
   353  	MOVQ    152(AX), R12
   354  	MOVQ    160(AX), R13
   355  
   356  sequenceDecs_decode_56_amd64_main_loop:
   357  	MOVQ (SP), R14
   358  
   359  	// Fill bitreader to have enough for the offset and match length.
        	// This is the only fill in the iteration; the literal length is read from the
        	// same buffer contents. Fast path (SI >= 8, signed compare): move the pointer
        	// back by the number of whole bytes already consumed (BX>>3), reload all 64
        	// bits of the buffer and keep only the sub-byte bit position in BX.
   360  	CMPQ SI, $0x08
   361  	JL   sequenceDecs_decode_56_amd64_fill_byte_by_byte
   362  	MOVQ BX, AX
   363  	SHRQ $0x03, AX
   364  	SUBQ AX, R14
   365  	MOVQ (R14), DX
   366  	SUBQ AX, SI
   367  	ANDQ $0x07, BX
   368  	JMP  sequenceDecs_decode_56_amd64_fill_end
   369  
        // Slow path: while bytes remain (SI > 0) and at least 8 bits have been consumed
        // (BX > 7), shift the buffer left one byte and OR in the next lower input byte.
   370  sequenceDecs_decode_56_amd64_fill_byte_by_byte:
   371  	CMPQ    SI, $0x00
   372  	JLE     sequenceDecs_decode_56_amd64_fill_check_overread
   373  	CMPQ    BX, $0x07
   374  	JLE     sequenceDecs_decode_56_amd64_fill_end
   375  	SHLQ    $0x08, DX
   376  	SUBQ    $0x01, R14
   377  	SUBQ    $0x01, SI
   378  	SUBQ    $0x08, BX
   379  	MOVBQZX (R14), AX
   380  	ORQ     AX, DX
   381  	JMP     sequenceDecs_decode_56_amd64_fill_byte_by_byte
   382  
        // No input bytes left: more than 64 consumed bits (unsigned compare) is an over-read.
   383  sequenceDecs_decode_56_amd64_fill_check_overread:
   384  	CMPQ BX, $0x40
   385  	JA   error_overread
   386  
   387  sequenceDecs_decode_56_amd64_fill_end:
   388  	// Update offset
        	// AX = offset state entry. R15 = bit buffer shifted left by the consumed bit
        	// count, so the next unread bit is at bit 63. CL = bits 8..15 of the entry
        	// (number of bits to read); AX becomes the upper 32 bits of the entry (base).
        	// The read is skipped when the bit count is zero, when the new consumed total
        	// exceeds 64 (BX has already been increased in that case), or when the bit
        	// count is 64 or more. Otherwise the count is negated so the right shift
        	// (count taken modulo 64) leaves the top CL bits of R15, which are added to AX.
   389  	MOVQ  R9, AX
   390  	MOVQ  BX, CX
   391  	MOVQ  DX, R15
   392  	SHLQ  CL, R15
   393  	MOVB  AH, CL
   394  	SHRQ  $0x20, AX
   395  	TESTQ CX, CX
   396  	JZ    sequenceDecs_decode_56_amd64_of_update_zero
   397  	ADDQ  CX, BX
   398  	CMPQ  BX, $0x40
   399  	JA    sequenceDecs_decode_56_amd64_of_update_zero
   400  	CMPQ  CX, $0x40
   401  	JAE   sequenceDecs_decode_56_amd64_of_update_zero
   402  	NEGQ  CX
   403  	SHRQ  CL, R15
   404  	ADDQ  R15, AX
   405  
        // Store the decoded offset value in the output record (+16).
   406  sequenceDecs_decode_56_amd64_of_update_zero:
   407  	MOVQ AX, 16(R10)
   408  
   409  	// Update match length
        	// Same read as for the offset, using the match-length state entry in R8.
   410  	MOVQ  R8, AX
   411  	MOVQ  BX, CX
   412  	MOVQ  DX, R15
   413  	SHLQ  CL, R15
   414  	MOVB  AH, CL
   415  	SHRQ  $0x20, AX
   416  	TESTQ CX, CX
   417  	JZ    sequenceDecs_decode_56_amd64_ml_update_zero
   418  	ADDQ  CX, BX
   419  	CMPQ  BX, $0x40
   420  	JA    sequenceDecs_decode_56_amd64_ml_update_zero
   421  	CMPQ  CX, $0x40
   422  	JAE   sequenceDecs_decode_56_amd64_ml_update_zero
   423  	NEGQ  CX
   424  	SHRQ  CL, R15
   425  	ADDQ  R15, AX
   426  
        // Store the decoded match length in the output record (+8).
   427  sequenceDecs_decode_56_amd64_ml_update_zero:
   428  	MOVQ AX, 8(R10)
   429  
   430  	// Update literal length
        	// Same read as for the offset, using the literal-length state entry in DI.
   431  	MOVQ  DI, AX
   432  	MOVQ  BX, CX
   433  	MOVQ  DX, R15
   434  	SHLQ  CL, R15
   435  	MOVB  AH, CL
   436  	SHRQ  $0x20, AX
   437  	TESTQ CX, CX
   438  	JZ    sequenceDecs_decode_56_amd64_ll_update_zero
   439  	ADDQ  CX, BX
   440  	CMPQ  BX, $0x40
   441  	JA    sequenceDecs_decode_56_amd64_ll_update_zero
   442  	CMPQ  CX, $0x40
   443  	JAE   sequenceDecs_decode_56_amd64_ll_update_zero
   444  	NEGQ  CX
   445  	SHRQ  CL, R15
   446  	ADDQ  R15, AX
   447  
        // Store the decoded literal length in the output record (+0).
   448  sequenceDecs_decode_56_amd64_ll_update_zero:
   449  	MOVQ AX, (R10)
   450  
   451  	// Fill bitreader for state updates
        	// Save the input pointer for the next iteration. AX = bits 8..15 of the offset
        	// state entry (its bit count), captured before R9 is replaced; it drives the
        	// offset adjustment below. The state updates are skipped when the counter at
        	// ctx+96 is zero.
   452  	MOVQ    R14, (SP)
   453  	MOVQ    R9, AX
   454  	SHRQ    $0x08, AX
   455  	MOVBQZX AL, AX
   456  	MOVQ    ctx+16(FP), CX
   457  	CMPQ    96(CX), $0x00
   458  	JZ      sequenceDecs_decode_56_amd64_skip_update
   459  
   460  	// Update Literal Length State
        	// R14 = low byte of the entry: number of bits to read for the next state.
        	// DI = bits 16..31 of the entry. The buffer is rotated left by the new
        	// consumed-bit total so the bits just consumed sit at the bottom, masked to
        	// R14 bits with (1<<R14)-1 (built with 32-bit operations in BP) and added to
        	// DI to form the table index.
   461  	MOVBQZX DI, R14
   462  	SHRQ    $0x10, DI
   463  	MOVWQZX DI, DI
   464  	LEAQ    (BX)(R14*1), CX
   465  	MOVQ    DX, R15
   466  	MOVQ    CX, BX
   467  	ROLQ    CL, R15
   468  	MOVL    $0x00000001, BP
   469  	MOVB    R14, CL
   470  	SHLL    CL, BP
   471  	DECL    BP
   472  	ANDQ    BP, R15
   473  	ADDQ    R15, DI
   474  
   475  	// Load ctx.llTable
        	// The table pointer is the quadword at ctx+0; entries are 8 bytes each.
   476  	MOVQ ctx+16(FP), CX
   477  	MOVQ (CX), CX
   478  	MOVQ (CX)(DI*8), DI
   479  
   480  	// Update Match Length State
        	// Same computation as for the literal-length state, on R8.
   481  	MOVBQZX R8, R14
   482  	SHRQ    $0x10, R8
   483  	MOVWQZX R8, R8
   484  	LEAQ    (BX)(R14*1), CX
   485  	MOVQ    DX, R15
   486  	MOVQ    CX, BX
   487  	ROLQ    CL, R15
   488  	MOVL    $0x00000001, BP
   489  	MOVB    R14, CL
   490  	SHLL    CL, BP
   491  	DECL    BP
   492  	ANDQ    BP, R15
   493  	ADDQ    R15, R8
   494  
   495  	// Load ctx.mlTable
        	// The table pointer is the quadword at ctx+24.
   496  	MOVQ ctx+16(FP), CX
   497  	MOVQ 24(CX), CX
   498  	MOVQ (CX)(R8*8), R8
   499  
   500  	// Update Offset State
        	// Same computation as for the literal-length state, on R9.
   501  	MOVBQZX R9, R14
   502  	SHRQ    $0x10, R9
   503  	MOVWQZX R9, R9
   504  	LEAQ    (BX)(R14*1), CX
   505  	MOVQ    DX, R15
   506  	MOVQ    CX, BX
   507  	ROLQ    CL, R15
   508  	MOVL    $0x00000001, BP
   509  	MOVB    R14, CL
   510  	SHLL    CL, BP
   511  	DECL    BP
   512  	ANDQ    BP, R15
   513  	ADDQ    R15, R9
   514  
   515  	// Load ctx.ofTable
        	// The table pointer is the quadword at ctx+48.
   516  	MOVQ ctx+16(FP), CX
   517  	MOVQ 48(CX), CX
   518  	MOVQ (CX)(R9*8), R9
   519  
   520  sequenceDecs_decode_56_amd64_skip_update:
   521  	// Adjust offset
        	// CX = offset value just stored. When the offset's bit count (AX) is above 1
        	// the value is used as is: it becomes the newest recent offset and the two
        	// older ones shift down.
   522  	MOVQ 16(R10), CX
   523  	CMPQ AX, $0x01
   524  	JBE  sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0
   525  	MOVQ R12, R13
   526  	MOVQ R11, R12
   527  	MOVQ CX, R11
   528  	JMP  sequenceDecs_decode_56_amd64_after_adjust
   529  
        // Bit count is 0 or 1: a zero literal length increments the decoded value
        // before the selection below.
   530  sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0:
   531  	CMPQ (R10), $0x00000000
   532  	JNE  sequenceDecs_decode_56_amd64_adjust_offset_maybezero
   533  	INCQ CX
   534  	JMP  sequenceDecs_decode_56_amd64_adjust_offset_nonzero
   535  
        // Non-zero literal length: a decoded value of zero reuses the newest recent
        // offset (R11) and leaves the offset history unchanged.
   536  sequenceDecs_decode_56_amd64_adjust_offset_maybezero:
   537  	TESTQ CX, CX
   538  	JNZ   sequenceDecs_decode_56_amd64_adjust_offset_nonzero
   539  	MOVQ  R11, CX
   540  	JMP   sequenceDecs_decode_56_amd64_after_adjust
   541  
        // Select the candidate offset into AX according to CX:
        // CX < 1 -> R11, CX == 1 -> R12, CX == 2 -> R13, CX > 2 -> R11 - 1.
   542  sequenceDecs_decode_56_amd64_adjust_offset_nonzero:
   543  	CMPQ CX, $0x01
   544  	JB   sequenceDecs_decode_56_amd64_adjust_zero
   545  	JEQ  sequenceDecs_decode_56_amd64_adjust_one
   546  	CMPQ CX, $0x02
   547  	JA   sequenceDecs_decode_56_amd64_adjust_three
   548  	JMP  sequenceDecs_decode_56_amd64_adjust_two
   549  
   550  sequenceDecs_decode_56_amd64_adjust_zero:
   551  	MOVQ R11, AX
   552  	JMP  sequenceDecs_decode_56_amd64_adjust_test_temp_valid
   553  
   554  sequenceDecs_decode_56_amd64_adjust_one:
   555  	MOVQ R12, AX
   556  	JMP  sequenceDecs_decode_56_amd64_adjust_test_temp_valid
   557  
   558  sequenceDecs_decode_56_amd64_adjust_two:
   559  	MOVQ R13, AX
   560  	JMP  sequenceDecs_decode_56_amd64_adjust_test_temp_valid
   561  
   562  sequenceDecs_decode_56_amd64_adjust_three:
   563  	LEAQ -1(R11), AX
   564  
        // A candidate of zero is replaced by 1.
   565  sequenceDecs_decode_56_amd64_adjust_test_temp_valid:
   566  	TESTQ AX, AX
   567  	JNZ   sequenceDecs_decode_56_amd64_adjust_temp_valid
   568  	MOVQ  $0x00000001, AX
   569  
        // Rotate the history: unless CX == 1, R13 takes the old R12; then R12 takes the
        // old R11 and the candidate becomes both the newest offset and the result in CX.
   570  sequenceDecs_decode_56_amd64_adjust_temp_valid:
   571  	CMPQ    CX, $0x01
   572  	CMOVQNE R12, R13
   573  	MOVQ    R11, R12
   574  	MOVQ    AX, R11
   575  	MOVQ    AX, CX
   576  
        // Store the final offset in the output record (+16).
   577  sequenceDecs_decode_56_amd64_after_adjust:
   578  	MOVQ CX, 16(R10)
   579  
   580  	// Check values
        	// AX = match length, R14 = literal length. Their sum is added to the quadword
        	// at s+256 and the literal length is subtracted from the quadword at ctx+128;
        	// a negative result is the not-enough-literals error. A match length above
        	// 0x20002 (unsigned compare) is an error, as is a zero offset combined with a
        	// non-zero match length.
   581  	MOVQ  8(R10), AX
   582  	MOVQ  (R10), R14
   583  	LEAQ  (AX)(R14*1), R15
   584  	MOVQ  s+0(FP), BP
   585  	ADDQ  R15, 256(BP)
   586  	MOVQ  ctx+16(FP), R15
   587  	SUBQ  R14, 128(R15)
   588  	JS    error_not_enough_literals
   589  	CMPQ  AX, $0x00020002
   590  	JA    sequenceDecs_decode_56_amd64_error_match_len_too_big
   591  	TESTQ CX, CX
   592  	JNZ   sequenceDecs_decode_56_amd64_match_len_ofs_ok
   593  	TESTQ AX, AX
   594  	JNZ   sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch
   595  
        // Advance to the next 24-byte output record and decrement the counter at ctx+96;
        // loop while it stays non-negative. On exit write the recent offsets back to s
        // and the bit buffer, consumed-bit count (low byte of BX) and remaining byte
        // count back to br.
   596  sequenceDecs_decode_56_amd64_match_len_ofs_ok:
   597  	ADDQ $0x18, R10
   598  	MOVQ ctx+16(FP), AX
   599  	DECQ 96(AX)
   600  	JNS  sequenceDecs_decode_56_amd64_main_loop
   601  	MOVQ s+0(FP), AX
   602  	MOVQ R11, 144(AX)
   603  	MOVQ R12, 152(AX)
   604  	MOVQ R13, 160(AX)
   605  	MOVQ br+8(FP), AX
   606  	MOVQ DX, 32(AX)
   607  	MOVB BL, 40(AX)
   608  	MOVQ SI, 24(AX)
   609  
   610  	// Return success
   611  	MOVQ $0x00000000, ret+24(FP)
   612  	RET
   613  
   614  	// Return with match length error
   615  sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch:
   616  	MOVQ $0x00000001, ret+24(FP)
   617  	RET
   618  
   619  	// Return with match too long error
   620  sequenceDecs_decode_56_amd64_error_match_len_too_big:
   621  	MOVQ $0x00000002, ret+24(FP)
   622  	RET
   623  
   624  	// Return with match offset too long error
        	// No label precedes these two instructions and nothing in this function
        	// branches to them.
   625  	MOVQ $0x00000003, ret+24(FP)
   626  	RET
   627  
   628  	// Return with not enough literals error
   629  error_not_enough_literals:
   630  	MOVQ $0x00000004, ret+24(FP)
   631  	RET
   632  
   633  	// Return with overread error
   634  error_overread:
   635  	MOVQ $0x00000006, ret+24(FP)
   636  	RET
   637  
   638  // func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
   639  // Requires: BMI, BMI2, CMOV
        //
        // Decodes sequences in a loop using BEXTR, BZHI and SHRX for the bit-field work.
        // Each iteration reads an offset, a match length and a literal length from the
        // bit buffer and stores them as three quadwords at the output pointer loaded
        // from ctx+104 (literal length at +0, match length at +8, offset at +16), then
        // advances that pointer by 24 bytes. The loop repeats until the counter at
        // ctx+96 is decremented below zero.
        //
        // Value written to ret+24(FP):
        //   0  success; the three recent offsets are stored back to s and the reader
        //      state (bit buffer, consumed-bit count, remaining bytes) back to br
        //   1  non-zero match length together with a zero offset
        //   2  match length above 0x20002
        //   4  the quadword at ctx+128 went negative after subtracting the literal length
        //   6  bit reader over-read
        // The error returns do not write the register state back to s or br; updates
        // already made in memory (s+256, ctx+128, ctx+96, the output records) remain.
        //
        // Register roles inside the loop:
        //   AX   bit buffer
        //   DX   number of bits already consumed from AX
        //   BX   input bytes not yet loaded (input is consumed from high to low addresses)
        //   R13  input pointer while filling (saved in (SP) between iterations); later
        //        reused for the offset bit count, the candidate offset and the match length
        //   SI, DI, R8     literal-length, match-length and offset state entries
        //   R9   output pointer
        //   R10, R11, R12  the three recent offsets, R10 being the newest
        //   CX, R14, R15, BP  scratch
   640  TEXT ·sequenceDecs_decode_bmi2(SB), $8-32
        	// Load the bit reader state from br.
   641  	MOVQ    br+8(FP), CX
   642  	MOVQ    32(CX), AX
   643  	MOVBQZX 40(CX), DX
   644  	MOVQ    24(CX), BX
        	// (SP) = pointer stored at br+0 plus the remaining byte count.
   645  	MOVQ    (CX), CX
   646  	ADDQ    BX, CX
   647  	MOVQ    CX, (SP)
        	// Load the three state entries and the output pointer from ctx.
   648  	MOVQ    ctx+16(FP), CX
   649  	MOVQ    72(CX), SI
   650  	MOVQ    80(CX), DI
   651  	MOVQ    88(CX), R8
   652  	MOVQ    104(CX), R9
        	// Load the three recent offsets from s.
   653  	MOVQ    s+0(FP), CX
   654  	MOVQ    144(CX), R10
   655  	MOVQ    152(CX), R11
   656  	MOVQ    160(CX), R12
   657  
   658  sequenceDecs_decode_bmi2_main_loop:
   659  	MOVQ (SP), R13
   660  
   661  	// Fill bitreader to have enough for the offset and match length.
        	// Fast path (BX >= 8, signed compare): move the pointer back by the number of
        	// whole bytes already consumed (DX>>3), reload all 64 bits of the buffer and
        	// keep only the sub-byte bit position in DX.
   662  	CMPQ BX, $0x08
   663  	JL   sequenceDecs_decode_bmi2_fill_byte_by_byte
   664  	MOVQ DX, CX
   665  	SHRQ $0x03, CX
   666  	SUBQ CX, R13
   667  	MOVQ (R13), AX
   668  	SUBQ CX, BX
   669  	ANDQ $0x07, DX
   670  	JMP  sequenceDecs_decode_bmi2_fill_end
   671  
        // Slow path: while bytes remain (BX > 0) and at least 8 bits have been consumed
        // (DX > 7), shift the buffer left one byte and OR in the next lower input byte.
   672  sequenceDecs_decode_bmi2_fill_byte_by_byte:
   673  	CMPQ    BX, $0x00
   674  	JLE     sequenceDecs_decode_bmi2_fill_check_overread
   675  	CMPQ    DX, $0x07
   676  	JLE     sequenceDecs_decode_bmi2_fill_end
   677  	SHLQ    $0x08, AX
   678  	SUBQ    $0x01, R13
   679  	SUBQ    $0x01, BX
   680  	SUBQ    $0x08, DX
   681  	MOVBQZX (R13), CX
   682  	ORQ     CX, AX
   683  	JMP     sequenceDecs_decode_bmi2_fill_byte_by_byte
   684  
        // No input bytes left: more than 64 consumed bits (unsigned compare) is an over-read.
   685  sequenceDecs_decode_bmi2_fill_check_overread:
   686  	CMPQ DX, $0x40
   687  	JA   error_overread
   688  
   689  sequenceDecs_decode_bmi2_fill_end:
   690  	// Update offset
        	// BEXTR with control 0x0808 (start bit 8, length 8) puts bits 8..15 of the
        	// offset state entry into R14: the number of bits to read. The buffer copy is
        	// rotated left by the new consumed-bit total (DX + R14) so the bits just
        	// consumed sit at the bottom, and BZHI keeps the low R14 bits. DX takes the
        	// new total and the extracted bits are added to the upper 32 bits of the entry.
   691  	MOVQ   $0x00000808, CX
   692  	BEXTRQ CX, R8, R14
   693  	MOVQ   AX, R15
   694  	LEAQ   (DX)(R14*1), CX
   695  	ROLQ   CL, R15
   696  	BZHIQ  R14, R15, R15
   697  	MOVQ   CX, DX
   698  	MOVQ   R8, CX
   699  	SHRQ   $0x20, CX
   700  	ADDQ   R15, CX
   701  	MOVQ   CX, 16(R9)
   702  
   703  	// Update match length
        	// Same read as for the offset, using the match-length state entry in DI;
        	// the result is stored at +8 of the output record.
   704  	MOVQ   $0x00000808, CX
   705  	BEXTRQ CX, DI, R14
   706  	MOVQ   AX, R15
   707  	LEAQ   (DX)(R14*1), CX
   708  	ROLQ   CL, R15
   709  	BZHIQ  R14, R15, R15
   710  	MOVQ   CX, DX
   711  	MOVQ   DI, CX
   712  	SHRQ   $0x20, CX
   713  	ADDQ   R15, CX
   714  	MOVQ   CX, 8(R9)
   715  
   716  	// Fill bitreader to have enough for the remaining
        	// Same refill procedure as at the top of the loop.
   717  	CMPQ BX, $0x08
   718  	JL   sequenceDecs_decode_bmi2_fill_2_byte_by_byte
   719  	MOVQ DX, CX
   720  	SHRQ $0x03, CX
   721  	SUBQ CX, R13
   722  	MOVQ (R13), AX
   723  	SUBQ CX, BX
   724  	ANDQ $0x07, DX
   725  	JMP  sequenceDecs_decode_bmi2_fill_2_end
   726  
   727  sequenceDecs_decode_bmi2_fill_2_byte_by_byte:
   728  	CMPQ    BX, $0x00
   729  	JLE     sequenceDecs_decode_bmi2_fill_2_check_overread
   730  	CMPQ    DX, $0x07
   731  	JLE     sequenceDecs_decode_bmi2_fill_2_end
   732  	SHLQ    $0x08, AX
   733  	SUBQ    $0x01, R13
   734  	SUBQ    $0x01, BX
   735  	SUBQ    $0x08, DX
   736  	MOVBQZX (R13), CX
   737  	ORQ     CX, AX
   738  	JMP     sequenceDecs_decode_bmi2_fill_2_byte_by_byte
   739  
   740  sequenceDecs_decode_bmi2_fill_2_check_overread:
   741  	CMPQ DX, $0x40
   742  	JA   error_overread
   743  
   744  sequenceDecs_decode_bmi2_fill_2_end:
   745  	// Update literal length
        	// Same read as for the offset, using the literal-length state entry in SI;
        	// the result is stored at +0 of the output record.
   746  	MOVQ   $0x00000808, CX
   747  	BEXTRQ CX, SI, R14
   748  	MOVQ   AX, R15
   749  	LEAQ   (DX)(R14*1), CX
   750  	ROLQ   CL, R15
   751  	BZHIQ  R14, R15, R15
   752  	MOVQ   CX, DX
   753  	MOVQ   SI, CX
   754  	SHRQ   $0x20, CX
   755  	ADDQ   R15, CX
   756  	MOVQ   CX, (R9)
   757  
   758  	// Fill bitreader for state updates
        	// Save the input pointer for the next iteration. R13 = bits 8..15 of the
        	// offset state entry (its bit count), captured before R8 is replaced; it
        	// drives the offset adjustment below. The state updates are skipped when the
        	// counter at ctx+96 is zero. Otherwise R14 = low byte of SI+DI+R8, i.e. the
        	// sum of the three entries' low bytes (modulo 256): the total number of bits
        	// for the three state updates. All of them are pulled out of the buffer at
        	// once into R15 and DX is advanced by the total.
   759  	MOVQ    R13, (SP)
   760  	MOVQ    $0x00000808, CX
   761  	BEXTRQ  CX, R8, R13
   762  	MOVQ    ctx+16(FP), CX
   763  	CMPQ    96(CX), $0x00
   764  	JZ      sequenceDecs_decode_bmi2_skip_update
   765  	LEAQ    (SI)(DI*1), R14
   766  	ADDQ    R8, R14
   767  	MOVBQZX R14, R14
   768  	LEAQ    (DX)(R14*1), CX
   769  	MOVQ    AX, R15
   770  	MOVQ    CX, DX
   771  	ROLQ    CL, R15
   772  	BZHIQ   R14, R15, R15
   773  
   774  	// Update Offset State
        	// BZHI/SHRX take their count from the low byte of the entry: CX = the lowest
        	// bits of R15 for this state, and R15 is shifted right past them. BEXTR with
        	// control 0x1010 (start bit 16, length 16) extracts bits 16..31 of the entry;
        	// adding CX gives the table index.
   775  	BZHIQ  R8, R15, CX
   776  	SHRXQ  R8, R15, R15
   777  	MOVQ   $0x00001010, R14
   778  	BEXTRQ R14, R8, R8
   779  	ADDQ   CX, R8
   780  
   781  	// Load ctx.ofTable
        	// The table pointer is the quadword at ctx+48; entries are 8 bytes each.
   782  	MOVQ ctx+16(FP), CX
   783  	MOVQ 48(CX), CX
   784  	MOVQ (CX)(R8*8), R8
   785  
   786  	// Update Match Length State
        	// Same computation as for the offset state, on DI.
   787  	BZHIQ  DI, R15, CX
   788  	SHRXQ  DI, R15, R15
   789  	MOVQ   $0x00001010, R14
   790  	BEXTRQ R14, DI, DI
   791  	ADDQ   CX, DI
   792  
   793  	// Load ctx.mlTable
        	// The table pointer is the quadword at ctx+24.
   794  	MOVQ ctx+16(FP), CX
   795  	MOVQ 24(CX), CX
   796  	MOVQ (CX)(DI*8), DI
   797  
   798  	// Update Literal Length State
        	// Last of the three, so R15 does not need to be shifted afterwards.
   799  	BZHIQ  SI, R15, CX
   800  	MOVQ   $0x00001010, R14
   801  	BEXTRQ R14, SI, SI
   802  	ADDQ   CX, SI
   803  
   804  	// Load ctx.llTable
        	// The table pointer is the quadword at ctx+0.
   805  	MOVQ ctx+16(FP), CX
   806  	MOVQ (CX), CX
   807  	MOVQ (CX)(SI*8), SI
   808  
   809  sequenceDecs_decode_bmi2_skip_update:
   810  	// Adjust offset
        	// CX = offset value just stored. When the offset's bit count (R13) is above 1
        	// the value is used as is: it becomes the newest recent offset and the two
        	// older ones shift down.
   811  	MOVQ 16(R9), CX
   812  	CMPQ R13, $0x01
   813  	JBE  sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0
   814  	MOVQ R11, R12
   815  	MOVQ R10, R11
   816  	MOVQ CX, R10
   817  	JMP  sequenceDecs_decode_bmi2_after_adjust
   818  
        // Bit count is 0 or 1: a zero literal length increments the decoded value
        // before the selection below.
   819  sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0:
   820  	CMPQ (R9), $0x00000000
   821  	JNE  sequenceDecs_decode_bmi2_adjust_offset_maybezero
   822  	INCQ CX
   823  	JMP  sequenceDecs_decode_bmi2_adjust_offset_nonzero
   824  
        // Non-zero literal length: a decoded value of zero reuses the newest recent
        // offset (R10) and leaves the offset history unchanged.
   825  sequenceDecs_decode_bmi2_adjust_offset_maybezero:
   826  	TESTQ CX, CX
   827  	JNZ   sequenceDecs_decode_bmi2_adjust_offset_nonzero
   828  	MOVQ  R10, CX
   829  	JMP   sequenceDecs_decode_bmi2_after_adjust
   830  
        // Select the candidate offset into R13 according to CX:
        // CX < 1 -> R10, CX == 1 -> R11, CX == 2 -> R12, CX > 2 -> R10 - 1.
   831  sequenceDecs_decode_bmi2_adjust_offset_nonzero:
   832  	CMPQ CX, $0x01
   833  	JB   sequenceDecs_decode_bmi2_adjust_zero
   834  	JEQ  sequenceDecs_decode_bmi2_adjust_one
   835  	CMPQ CX, $0x02
   836  	JA   sequenceDecs_decode_bmi2_adjust_three
   837  	JMP  sequenceDecs_decode_bmi2_adjust_two
   838  
   839  sequenceDecs_decode_bmi2_adjust_zero:
   840  	MOVQ R10, R13
   841  	JMP  sequenceDecs_decode_bmi2_adjust_test_temp_valid
   842  
   843  sequenceDecs_decode_bmi2_adjust_one:
   844  	MOVQ R11, R13
   845  	JMP  sequenceDecs_decode_bmi2_adjust_test_temp_valid
   846  
   847  sequenceDecs_decode_bmi2_adjust_two:
   848  	MOVQ R12, R13
   849  	JMP  sequenceDecs_decode_bmi2_adjust_test_temp_valid
   850  
   851  sequenceDecs_decode_bmi2_adjust_three:
   852  	LEAQ -1(R10), R13
   853  
        // A candidate of zero is replaced by 1.
   854  sequenceDecs_decode_bmi2_adjust_test_temp_valid:
   855  	TESTQ R13, R13
   856  	JNZ   sequenceDecs_decode_bmi2_adjust_temp_valid
   857  	MOVQ  $0x00000001, R13
   858  
        // Rotate the history: unless CX == 1, R12 takes the old R11; then R11 takes the
        // old R10 and the candidate becomes both the newest offset and the result in CX.
   859  sequenceDecs_decode_bmi2_adjust_temp_valid:
   860  	CMPQ    CX, $0x01
   861  	CMOVQNE R11, R12
   862  	MOVQ    R10, R11
   863  	MOVQ    R13, R10
   864  	MOVQ    R13, CX
   865  
        // Store the final offset in the output record (+16).
   866  sequenceDecs_decode_bmi2_after_adjust:
   867  	MOVQ CX, 16(R9)
   868  
   869  	// Check values
        	// R13 = match length, R14 = literal length. Their sum is added to the quadword
        	// at s+256 and the literal length is subtracted from the quadword at ctx+128;
        	// a negative result is the not-enough-literals error. A match length above
        	// 0x20002 (unsigned compare) is an error, as is a zero offset combined with a
        	// non-zero match length.
   870  	MOVQ  8(R9), R13
   871  	MOVQ  (R9), R14
   872  	LEAQ  (R13)(R14*1), R15
   873  	MOVQ  s+0(FP), BP
   874  	ADDQ  R15, 256(BP)
   875  	MOVQ  ctx+16(FP), R15
   876  	SUBQ  R14, 128(R15)
   877  	JS    error_not_enough_literals
   878  	CMPQ  R13, $0x00020002
   879  	JA    sequenceDecs_decode_bmi2_error_match_len_too_big
   880  	TESTQ CX, CX
   881  	JNZ   sequenceDecs_decode_bmi2_match_len_ofs_ok
   882  	TESTQ R13, R13
   883  	JNZ   sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch
   884  
        // Advance to the next 24-byte output record and decrement the counter at ctx+96;
        // loop while it stays non-negative. On exit write the recent offsets back to s
        // and the bit buffer, consumed-bit count (low byte of DX) and remaining byte
        // count back to br.
   885  sequenceDecs_decode_bmi2_match_len_ofs_ok:
   886  	ADDQ $0x18, R9
   887  	MOVQ ctx+16(FP), CX
   888  	DECQ 96(CX)
   889  	JNS  sequenceDecs_decode_bmi2_main_loop
   890  	MOVQ s+0(FP), CX
   891  	MOVQ R10, 144(CX)
   892  	MOVQ R11, 152(CX)
   893  	MOVQ R12, 160(CX)
   894  	MOVQ br+8(FP), CX
   895  	MOVQ AX, 32(CX)
   896  	MOVB DL, 40(CX)
   897  	MOVQ BX, 24(CX)
   898  
   899  	// Return success
   900  	MOVQ $0x00000000, ret+24(FP)
   901  	RET
   902  
   903  	// Return with match length error
   904  sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch:
   905  	MOVQ $0x00000001, ret+24(FP)
   906  	RET
   907  
   908  	// Return with match too long error
   909  sequenceDecs_decode_bmi2_error_match_len_too_big:
   910  	MOVQ $0x00000002, ret+24(FP)
   911  	RET
   912  
   913  	// Return with match offset too long error
        	// No label precedes these two instructions and nothing in this function
        	// branches to them.
   914  	MOVQ $0x00000003, ret+24(FP)
   915  	RET
   916  
   917  	// Return with not enough literals error
   918  error_not_enough_literals:
   919  	MOVQ $0x00000004, ret+24(FP)
   920  	RET
   921  
   922  	// Return with overread error
   923  error_overread:
   924  	MOVQ $0x00000006, ret+24(FP)
   925  	RET
   926  
   927  // func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
   928  // Requires: BMI, BMI2, CMOV
   929  TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32
   930  	MOVQ    br+8(FP), CX
   931  	MOVQ    32(CX), AX
   932  	MOVBQZX 40(CX), DX
   933  	MOVQ    24(CX), BX
   934  	MOVQ    (CX), CX
   935  	ADDQ    BX, CX
   936  	MOVQ    CX, (SP)
   937  	MOVQ    ctx+16(FP), CX
   938  	MOVQ    72(CX), SI
   939  	MOVQ    80(CX), DI
   940  	MOVQ    88(CX), R8
   941  	MOVQ    104(CX), R9
   942  	MOVQ    s+0(FP), CX
   943  	MOVQ    144(CX), R10
   944  	MOVQ    152(CX), R11
   945  	MOVQ    160(CX), R12
   946  
   947  sequenceDecs_decode_56_bmi2_main_loop:
   948  	MOVQ (SP), R13
   949  
   950  	// Fill bitreader to have enough for the offset and match length.
   951  	CMPQ BX, $0x08
   952  	JL   sequenceDecs_decode_56_bmi2_fill_byte_by_byte
   953  	MOVQ DX, CX
   954  	SHRQ $0x03, CX
   955  	SUBQ CX, R13
   956  	MOVQ (R13), AX
   957  	SUBQ CX, BX
   958  	ANDQ $0x07, DX
   959  	JMP  sequenceDecs_decode_56_bmi2_fill_end
   960  
   961  sequenceDecs_decode_56_bmi2_fill_byte_by_byte:
   962  	CMPQ    BX, $0x00
   963  	JLE     sequenceDecs_decode_56_bmi2_fill_check_overread
   964  	CMPQ    DX, $0x07
   965  	JLE     sequenceDecs_decode_56_bmi2_fill_end
   966  	SHLQ    $0x08, AX
   967  	SUBQ    $0x01, R13
   968  	SUBQ    $0x01, BX
   969  	SUBQ    $0x08, DX
   970  	MOVBQZX (R13), CX
   971  	ORQ     CX, AX
   972  	JMP     sequenceDecs_decode_56_bmi2_fill_byte_by_byte
   973  
   974  sequenceDecs_decode_56_bmi2_fill_check_overread:
   975  	CMPQ DX, $0x40
   976  	JA   error_overread
   977  
   978  sequenceDecs_decode_56_bmi2_fill_end:
   979  	// Update offset
   980  	MOVQ   $0x00000808, CX
   981  	BEXTRQ CX, R8, R14
   982  	MOVQ   AX, R15
   983  	LEAQ   (DX)(R14*1), CX
   984  	ROLQ   CL, R15
   985  	BZHIQ  R14, R15, R15
   986  	MOVQ   CX, DX
   987  	MOVQ   R8, CX
   988  	SHRQ   $0x20, CX
   989  	ADDQ   R15, CX
   990  	MOVQ   CX, 16(R9)
   991  
   992  	// Update match length
   993  	MOVQ   $0x00000808, CX
   994  	BEXTRQ CX, DI, R14
   995  	MOVQ   AX, R15
   996  	LEAQ   (DX)(R14*1), CX
   997  	ROLQ   CL, R15
   998  	BZHIQ  R14, R15, R15
   999  	MOVQ   CX, DX
  1000  	MOVQ   DI, CX
  1001  	SHRQ   $0x20, CX
  1002  	ADDQ   R15, CX
  1003  	MOVQ   CX, 8(R9)
  1004  
  1005  	// Update literal length
  1006  	MOVQ   $0x00000808, CX
  1007  	BEXTRQ CX, SI, R14
  1008  	MOVQ   AX, R15
  1009  	LEAQ   (DX)(R14*1), CX
  1010  	ROLQ   CL, R15
  1011  	BZHIQ  R14, R15, R15
  1012  	MOVQ   CX, DX
  1013  	MOVQ   SI, CX
  1014  	SHRQ   $0x20, CX
  1015  	ADDQ   R15, CX
  1016  	MOVQ   CX, (R9)
  1017  
  1018  	// Fill bitreader for state updates
  1019  	MOVQ    R13, (SP)
  1020  	MOVQ    $0x00000808, CX
  1021  	BEXTRQ  CX, R8, R13
  1022  	MOVQ    ctx+16(FP), CX
  1023  	CMPQ    96(CX), $0x00
  1024  	JZ      sequenceDecs_decode_56_bmi2_skip_update
  1025  	LEAQ    (SI)(DI*1), R14
  1026  	ADDQ    R8, R14
  1027  	MOVBQZX R14, R14
  1028  	LEAQ    (DX)(R14*1), CX
  1029  	MOVQ    AX, R15
  1030  	MOVQ    CX, DX
  1031  	ROLQ    CL, R15
  1032  	BZHIQ   R14, R15, R15
  1033  
  1034  	// Update Offset State
  1035  	BZHIQ  R8, R15, CX
  1036  	SHRXQ  R8, R15, R15
  1037  	MOVQ   $0x00001010, R14
  1038  	BEXTRQ R14, R8, R8
  1039  	ADDQ   CX, R8
  1040  
  1041  	// Load ctx.ofTable
  1042  	MOVQ ctx+16(FP), CX
  1043  	MOVQ 48(CX), CX
  1044  	MOVQ (CX)(R8*8), R8
  1045  
  1046  	// Update Match Length State
  1047  	BZHIQ  DI, R15, CX
  1048  	SHRXQ  DI, R15, R15
  1049  	MOVQ   $0x00001010, R14
  1050  	BEXTRQ R14, DI, DI
  1051  	ADDQ   CX, DI
  1052  
  1053  	// Load ctx.mlTable
  1054  	MOVQ ctx+16(FP), CX
  1055  	MOVQ 24(CX), CX
  1056  	MOVQ (CX)(DI*8), DI
  1057  
  1058  	// Update Literal Length State
  1059  	BZHIQ  SI, R15, CX
  1060  	MOVQ   $0x00001010, R14
  1061  	BEXTRQ R14, SI, SI
  1062  	ADDQ   CX, SI
  1063  
  1064  	// Load ctx.llTable
  1065  	MOVQ ctx+16(FP), CX
  1066  	MOVQ (CX), CX
  1067  	MOVQ (CX)(SI*8), SI
  1068  
  1069  sequenceDecs_decode_56_bmi2_skip_update:
  1070  	// Adjust offset
  1071  	MOVQ 16(R9), CX
  1072  	CMPQ R13, $0x01
  1073  	JBE  sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0
  1074  	MOVQ R11, R12
  1075  	MOVQ R10, R11
  1076  	MOVQ CX, R10
  1077  	JMP  sequenceDecs_decode_56_bmi2_after_adjust
  1078  
  1079  sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0:
  1080  	CMPQ (R9), $0x00000000
  1081  	JNE  sequenceDecs_decode_56_bmi2_adjust_offset_maybezero
  1082  	INCQ CX
  1083  	JMP  sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
  1084  
  1085  sequenceDecs_decode_56_bmi2_adjust_offset_maybezero:
  1086  	TESTQ CX, CX
  1087  	JNZ   sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
  1088  	MOVQ  R10, CX
  1089  	JMP   sequenceDecs_decode_56_bmi2_after_adjust
  1090  
  1091  sequenceDecs_decode_56_bmi2_adjust_offset_nonzero:
  1092  	CMPQ CX, $0x01
  1093  	JB   sequenceDecs_decode_56_bmi2_adjust_zero
  1094  	JEQ  sequenceDecs_decode_56_bmi2_adjust_one
  1095  	CMPQ CX, $0x02
  1096  	JA   sequenceDecs_decode_56_bmi2_adjust_three
  1097  	JMP  sequenceDecs_decode_56_bmi2_adjust_two
  1098  
  1099  sequenceDecs_decode_56_bmi2_adjust_zero:
  1100  	MOVQ R10, R13
  1101  	JMP  sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
  1102  
  1103  sequenceDecs_decode_56_bmi2_adjust_one:
  1104  	MOVQ R11, R13
  1105  	JMP  sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
  1106  
  1107  sequenceDecs_decode_56_bmi2_adjust_two:
  1108  	MOVQ R12, R13
  1109  	JMP  sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
  1110  
  1111  sequenceDecs_decode_56_bmi2_adjust_three:
  1112  	LEAQ -1(R10), R13
  1113  
  1114  sequenceDecs_decode_56_bmi2_adjust_test_temp_valid:
  1115  	TESTQ R13, R13
  1116  	JNZ   sequenceDecs_decode_56_bmi2_adjust_temp_valid
  1117  	MOVQ  $0x00000001, R13
  1118  
  1119  sequenceDecs_decode_56_bmi2_adjust_temp_valid:
  1120  	CMPQ    CX, $0x01
  1121  	CMOVQNE R11, R12
  1122  	MOVQ    R10, R11
  1123  	MOVQ    R13, R10
  1124  	MOVQ    R13, CX
  1125  
  1126  sequenceDecs_decode_56_bmi2_after_adjust:
  1127  	MOVQ CX, 16(R9)
  1128  
  1129  	// Check values
  1130  	MOVQ  8(R9), R13
  1131  	MOVQ  (R9), R14
  1132  	LEAQ  (R13)(R14*1), R15
  1133  	MOVQ  s+0(FP), BP
  1134  	ADDQ  R15, 256(BP)
  1135  	MOVQ  ctx+16(FP), R15
  1136  	SUBQ  R14, 128(R15)
  1137  	JS    error_not_enough_literals
  1138  	CMPQ  R13, $0x00020002
  1139  	JA    sequenceDecs_decode_56_bmi2_error_match_len_too_big
  1140  	TESTQ CX, CX
  1141  	JNZ   sequenceDecs_decode_56_bmi2_match_len_ofs_ok
  1142  	TESTQ R13, R13
  1143  	JNZ   sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch
  1144  
  1145  sequenceDecs_decode_56_bmi2_match_len_ofs_ok:
  1146  	ADDQ $0x18, R9
  1147  	MOVQ ctx+16(FP), CX
  1148  	DECQ 96(CX)
  1149  	JNS  sequenceDecs_decode_56_bmi2_main_loop
  1150  	MOVQ s+0(FP), CX
  1151  	MOVQ R10, 144(CX)
  1152  	MOVQ R11, 152(CX)
  1153  	MOVQ R12, 160(CX)
  1154  	MOVQ br+8(FP), CX
  1155  	MOVQ AX, 32(CX)
  1156  	MOVB DL, 40(CX)
  1157  	MOVQ BX, 24(CX)
  1158  
  1159  	// Return success
  1160  	MOVQ $0x00000000, ret+24(FP)
  1161  	RET
  1162  
  1163  	// Return with match length error
  1164  sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch:
  1165  	MOVQ $0x00000001, ret+24(FP)
  1166  	RET
  1167  
  1168  	// Return with match too long error
  1169  sequenceDecs_decode_56_bmi2_error_match_len_too_big:
  1170  	MOVQ $0x00000002, ret+24(FP)
  1171  	RET
  1172  
  1173  	// Return with match offset too long error
  1174  	MOVQ $0x00000003, ret+24(FP)
  1175  	RET
  1176  
  1177  	// Return with not enough literals error
  1178  error_not_enough_literals:
  1179  	MOVQ $0x00000004, ret+24(FP)
  1180  	RET
  1181  
  1182  	// Return with overread error
  1183  error_overread:
  1184  	MOVQ $0x00000006, ret+24(FP)
  1185  	RET
  1186  
  1187  // func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
  1188  // Requires: SSE. Walks the sequences referenced by ctx and, for each one, copies its literals and then its match into the output; returns true on success (or when there are no sequences) and false when a match offset fails the range checks. Literal and non-overlapping match copies in this variant work in whole 16-byte chunks and can touch up to 15 bytes past the requested length.
  1189  TEXT ·sequenceDecs_executeSimple_amd64(SB), $8-9 // 8-byte frame; 9 bytes of args+result (ctx pointer, bool)
  1190  	MOVQ  ctx+0(FP), R10
  1191  	MOVQ  8(R10), CX // CX = sequence count (exclusive upper bound for DX in the loop tests)
  1192  	TESTQ CX, CX
  1193  	JZ    empty_seqs // nothing to execute
  1194  	MOVQ  (R10), AX // AX = base of the sequence array (24-byte entries)
  1195  	MOVQ  24(R10), DX // DX = seqIndex
  1196  	MOVQ  32(R10), BX // BX = output base
  1197  	MOVQ  80(R10), SI // SI = literals read pointer
  1198  	MOVQ  104(R10), DI // DI = outPosition
  1199  	MOVQ  120(R10), R8 // R8 = upper limit for the match offset (s.windowSize per the check_offset comment)
  1200  	MOVQ  56(R10), R9 // R9 = history base
  1201  	MOVQ  64(R10), R10 // R10 = len(hist); overwrites the ctx pointer, which is reloaded from the frame on exit
  1202  	ADDQ  R10, R9 // R9 = one past the end of history
  1203  
  1204  	// seqsBase += 24 * seqIndex
  1205  	LEAQ (DX)(DX*2), R11 // R11 = 3 * seqIndex
  1206  	SHLQ $0x03, R11 // R11 = 24 * seqIndex
  1207  	ADDQ R11, AX
  1208  
  1209  	// outBase += outPosition
  1210  	ADDQ DI, BX // BX is the output write pointer from here on
  1211  
  1212  main_loop:
  1213  	MOVQ (AX), R11 // R11 = literal length of this sequence
  1214  	MOVQ 16(AX), R12 // R12 = match offset (seq.mo)
  1215  	MOVQ 8(AX), R13 // R13 = match length
  1216  
  1217  	// Copy literals
  1218  	TESTQ R11, R11
  1219  	JZ    check_offset // no literals in this sequence
  1220  	XORQ  R14, R14 // R14 = bytes copied so far
  1221  
  1222  copy_1: // 16-byte chunks until R14 >= R11; the last chunk can read and write up to 15 bytes past the literal run
  1223  	MOVUPS (SI)(R14*1), X0
  1224  	MOVUPS X0, (BX)(R14*1)
  1225  	ADDQ   $0x10, R14
  1226  	CMPQ   R14, R11
  1227  	JB     copy_1
  1228  	ADDQ   R11, SI // pointers advance by the exact literal length, not the rounded-up amount
  1229  	ADDQ   R11, BX
  1230  	ADDQ   R11, DI
  1231  
  1232  	// Malformed input if (seq.mo > t+len(hist) || seq.mo > s.windowSize)
  1233  check_offset:
  1234  	LEAQ (DI)(R10*1), R11 // R11 = outPosition + len(hist)
  1235  	CMPQ R12, R11
  1236  	JG   error_match_off_too_big // signed compare: offset reaches before the start of history
  1237  	CMPQ R12, R8
  1238  	JG   error_match_off_too_big // signed compare against the limit loaded from 120(ctx)
  1239  
  1240  	// Copy match from history
  1241  	MOVQ R12, R11
  1242  	SUBQ DI, R11 // R11 = mo - outPosition = how far the match start lies inside history
  1243  	JLS  copy_match // unsigned mo <= outPosition: the match lies entirely in the current output
  1244  	MOVQ R9, R14
  1245  	SUBQ R11, R14 // R14 = source pointer, R11 bytes before the end of history
  1246  	CMPQ R13, R11
  1247  	JG   copy_all_from_history // match is longer than what history provides: take R11 bytes there, the rest from the output
  1248  	MOVQ R13, R11 // whole match comes from history; R11 becomes the chunk counter
  1249  	SUBQ $0x10, R11
  1250  	JB   copy_4_small // fewer than 16 bytes to copy
  1251  
  1252  copy_4_loop: // exact-length copy of R13 bytes (R13 >= 16) from history
  1253  	MOVUPS (R14), X0
  1254  	MOVUPS X0, (BX)
  1255  	ADDQ   $0x10, R14
  1256  	ADDQ   $0x10, BX
  1257  	SUBQ   $0x10, R11
  1258  	JAE    copy_4_loop // continue while at least 16 more bytes remain
  1259  	LEAQ   16(R14)(R11*1), R14 // R11 is now in [-16,-1]; move both pointers to the exact end of the copy
  1260  	LEAQ   16(BX)(R11*1), BX
  1261  	MOVUPS -16(R14), X0 // final 16 bytes ending exactly at the end; may overlap the previous chunk
  1262  	MOVUPS X0, -16(BX)
  1263  	JMP    copy_4_end
  1264  
  1265  copy_4_small: // NOTE(review): no 1-or-2 byte case here; presumably the match length is always >= 3 on this path - confirm
  1266  	CMPQ R13, $0x03
  1267  	JE   copy_4_move_3
  1268  	CMPQ R13, $0x08
  1269  	JB   copy_4_move_4through7
  1270  	JMP  copy_4_move_8through16
  1271  
  1272  copy_4_move_3: // R11 and R12 are scratch below; the offset in R12 is not read again before main_loop reloads it
  1273  	MOVW (R14), R11
  1274  	MOVB 2(R14), R12
  1275  	MOVW R11, (BX)
  1276  	MOVB R12, 2(BX)
  1277  	ADDQ R13, R14
  1278  	ADDQ R13, BX
  1279  	JMP  copy_4_end
  1280  
  1281  copy_4_move_4through7: // first 4 and last 4 bytes; the two moves may overlap
  1282  	MOVL (R14), R11
  1283  	MOVL -4(R14)(R13*1), R12
  1284  	MOVL R11, (BX)
  1285  	MOVL R12, -4(BX)(R13*1)
  1286  	ADDQ R13, R14
  1287  	ADDQ R13, BX
  1288  	JMP  copy_4_end
  1289  
  1290  copy_4_move_8through16: // first 8 and last 8 bytes; the two moves may overlap
  1291  	MOVQ (R14), R11
  1292  	MOVQ -8(R14)(R13*1), R12
  1293  	MOVQ R11, (BX)
  1294  	MOVQ R12, -8(BX)(R13*1)
  1295  	ADDQ R13, R14
  1296  	ADDQ R13, BX
  1297  
  1298  copy_4_end: // match fully served from history; same per-sequence bookkeeping as handle_loop
  1299  	ADDQ R13, DI // outPosition += match length
  1300  	ADDQ $0x18, AX // next 24-byte sequence entry
  1301  	INCQ DX
  1302  	CMPQ DX, CX
  1303  	JB   main_loop
  1304  	JMP  loop_finished
  1305  
  1306  copy_all_from_history: // copy exactly R11 bytes (everything from R14 to the end of history)
  1307  	MOVQ R11, R15
  1308  	SUBQ $0x10, R15
  1309  	JB   copy_5_small // fewer than 16 bytes
  1310  
  1311  copy_5_loop:
  1312  	MOVUPS (R14), X0
  1313  	MOVUPS X0, (BX)
  1314  	ADDQ   $0x10, R14
  1315  	ADDQ   $0x10, BX
  1316  	SUBQ   $0x10, R15
  1317  	JAE    copy_5_loop
  1318  	LEAQ   16(R14)(R15*1), R14 // R15 is in [-16,-1]; land both pointers on the exact end
  1319  	LEAQ   16(BX)(R15*1), BX
  1320  	MOVUPS -16(R14), X0 // final 16 bytes ending exactly at the end; may overlap the previous chunk
  1321  	MOVUPS X0, -16(BX)
  1322  	JMP    copy_5_end
  1323  
  1324  copy_5_small:
  1325  	CMPQ R11, $0x03
  1326  	JE   copy_5_move_3
  1327  	JB   copy_5_move_1or2 // flags still come from the compare with 3
  1328  	CMPQ R11, $0x08
  1329  	JB   copy_5_move_4through7
  1330  	JMP  copy_5_move_8through16
  1331  
  1332  copy_5_move_1or2: // first and last byte (the same byte when R11 == 1); BP is used as scratch
  1333  	MOVB (R14), R15
  1334  	MOVB -1(R14)(R11*1), BP
  1335  	MOVB R15, (BX)
  1336  	MOVB BP, -1(BX)(R11*1)
  1337  	ADDQ R11, R14
  1338  	ADDQ R11, BX
  1339  	JMP  copy_5_end
  1340  
  1341  copy_5_move_3:
  1342  	MOVW (R14), R15
  1343  	MOVB 2(R14), BP
  1344  	MOVW R15, (BX)
  1345  	MOVB BP, 2(BX)
  1346  	ADDQ R11, R14
  1347  	ADDQ R11, BX
  1348  	JMP  copy_5_end
  1349  
  1350  copy_5_move_4through7: // first 4 and last 4 bytes; the two moves may overlap
  1351  	MOVL (R14), R15
  1352  	MOVL -4(R14)(R11*1), BP
  1353  	MOVL R15, (BX)
  1354  	MOVL BP, -4(BX)(R11*1)
  1355  	ADDQ R11, R14
  1356  	ADDQ R11, BX
  1357  	JMP  copy_5_end
  1358  
  1359  copy_5_move_8through16: // first 8 and last 8 bytes; the two moves may overlap
  1360  	MOVQ (R14), R15
  1361  	MOVQ -8(R14)(R11*1), BP
  1362  	MOVQ R15, (BX)
  1363  	MOVQ BP, -8(BX)(R11*1)
  1364  	ADDQ R11, R14
  1365  	ADDQ R11, BX
  1366  
  1367  copy_5_end:
  1368  	ADDQ R11, DI // outPosition += bytes taken from history
  1369  	SUBQ R11, R13 // R13 = match bytes still to copy; falls through to copy_match
  1370  
  1371  	// Copy match from the current buffer
  1372  copy_match:
  1373  	MOVQ BX, R11
  1374  	SUBQ R12, R11 // R11 = source pointer = write pointer - match offset
  1375  
  1376  	// ml <= mo
  1377  	CMPQ R13, R12
  1378  	JA   copy_overlapping_match // unsigned ml > mo: source and destination ranges overlap
  1379  
  1380  	// Copy non-overlapping match
  1381  	ADDQ R13, DI // outPosition += ml
  1382  	MOVQ BX, R12 // R12 = destination cursor for the chunk loop
  1383  	ADDQ R13, BX // BX already points past the match
  1384  
  1385  copy_2: // 16-byte chunks; always writes at least one chunk and can run up to 15 bytes (16 when R13 is 0) past the match
  1386  	MOVUPS (R11), X0
  1387  	MOVUPS X0, (R12)
  1388  	ADDQ   $0x10, R11
  1389  	ADDQ   $0x10, R12
  1390  	SUBQ   $0x10, R13
  1391  	JHI    copy_2 // loop while more than 16 bytes were left before the subtraction
  1392  	JMP    handle_loop
  1393  
  1394  	// Copy overlapping match
  1395  copy_overlapping_match:
  1396  	ADDQ R13, DI // outPosition += ml
  1397  
  1398  copy_slow_3: // one byte at a time so bytes written earlier in this copy can be read back as source
  1399  	MOVB (R11), R12
  1400  	MOVB R12, (BX)
  1401  	INCQ R11
  1402  	INCQ BX
  1403  	DECQ R13
  1404  	JNZ  copy_slow_3
  1405  
  1406  handle_loop:
  1407  	ADDQ $0x18, AX // next 24-byte sequence entry
  1408  	INCQ DX
  1409  	CMPQ DX, CX
  1410  	JB   main_loop
  1411  
  1412  loop_finished:
  1413  	// Return value
  1414  	MOVB $0x01, ret+8(FP) // true
  1415  
  1416  	// Update the context
  1417  	MOVQ ctx+0(FP), AX // reload ctx; R10 was reused for len(hist)
  1418  	MOVQ DX, 24(AX) // seqIndex
  1419  	MOVQ DI, 104(AX) // outPosition
  1420  	SUBQ 80(AX), SI // SI = literals pointer - literals base = literal bytes consumed
  1421  	MOVQ SI, 112(AX)
  1422  	RET
  1423  
  1424  error_match_off_too_big: // DX still holds the index of the rejected sequence; its literals have already been copied
  1425  	// Return value
  1426  	MOVB $0x00, ret+8(FP) // false
  1427  
  1428  	// Update the context
  1429  	MOVQ ctx+0(FP), AX
  1430  	MOVQ DX, 24(AX)
  1431  	MOVQ DI, 104(AX)
  1432  	SUBQ 80(AX), SI
  1433  	MOVQ SI, 112(AX)
  1434  	RET
  1435  
  1436  empty_seqs: // no sequences: report success without writing anything back to ctx
  1437  	// Return value
  1438  	MOVB $0x01, ret+8(FP)
  1439  	RET
  1440  
  1441  // func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool
  1442  // Requires: SSE. Same job as sequenceDecs_executeSimple_amd64, but literal and non-overlapping match copies are exact-length: no byte outside the requested range is read or written. Returns true on success (or when there are no sequences) and false when a match offset fails the range checks.
  1443  TEXT ·sequenceDecs_executeSimple_safe_amd64(SB), $8-9 // 8-byte frame; 9 bytes of args+result (ctx pointer, bool)
  1444  	MOVQ  ctx+0(FP), R10
  1445  	MOVQ  8(R10), CX // CX = sequence count (exclusive upper bound for DX in the loop tests)
  1446  	TESTQ CX, CX
  1447  	JZ    empty_seqs // nothing to execute
  1448  	MOVQ  (R10), AX // AX = base of the sequence array (24-byte entries)
  1449  	MOVQ  24(R10), DX // DX = seqIndex
  1450  	MOVQ  32(R10), BX // BX = output base
  1451  	MOVQ  80(R10), SI // SI = literals read pointer
  1452  	MOVQ  104(R10), DI // DI = outPosition
  1453  	MOVQ  120(R10), R8 // R8 = upper limit for the match offset (s.windowSize per the check_offset comment)
  1454  	MOVQ  56(R10), R9 // R9 = history base
  1455  	MOVQ  64(R10), R10 // R10 = len(hist); overwrites the ctx pointer, which is reloaded from the frame on exit
  1456  	ADDQ  R10, R9 // R9 = one past the end of history
  1457  
  1458  	// seqsBase += 24 * seqIndex
  1459  	LEAQ (DX)(DX*2), R11 // R11 = 3 * seqIndex
  1460  	SHLQ $0x03, R11 // R11 = 24 * seqIndex
  1461  	ADDQ R11, AX
  1462  
  1463  	// outBase += outPosition
  1464  	ADDQ DI, BX // BX is the output write pointer from here on
  1465  
  1466  main_loop:
  1467  	MOVQ (AX), R11 // R11 = literal length of this sequence
  1468  	MOVQ 16(AX), R12 // R12 = match offset (seq.mo)
  1469  	MOVQ 8(AX), R13 // R13 = match length
  1470  
  1471  	// Copy literals
  1472  	TESTQ R11, R11
  1473  	JZ    check_offset // no literals in this sequence
  1474  	MOVQ  R11, R14 // R14 = chunk counter for the exact-length copy
  1475  	SUBQ  $0x10, R14
  1476  	JB    copy_1_small // fewer than 16 literal bytes
  1477  
  1478  copy_1_loop: // exact-length copy of R11 bytes (R11 >= 16); SI and BX advance as it goes
  1479  	MOVUPS (SI), X0
  1480  	MOVUPS X0, (BX)
  1481  	ADDQ   $0x10, SI
  1482  	ADDQ   $0x10, BX
  1483  	SUBQ   $0x10, R14
  1484  	JAE    copy_1_loop // continue while at least 16 more bytes remain
  1485  	LEAQ   16(SI)(R14*1), SI // R14 is in [-16,-1]; land both pointers on the exact end
  1486  	LEAQ   16(BX)(R14*1), BX
  1487  	MOVUPS -16(SI), X0 // final 16 bytes ending exactly at the end; may overlap the previous chunk
  1488  	MOVUPS X0, -16(BX)
  1489  	JMP    copy_1_end
  1490  
  1491  copy_1_small:
  1492  	CMPQ R11, $0x03
  1493  	JE   copy_1_move_3
  1494  	JB   copy_1_move_1or2 // flags still come from the compare with 3
  1495  	CMPQ R11, $0x08
  1496  	JB   copy_1_move_4through7
  1497  	JMP  copy_1_move_8through16
  1498  
  1499  copy_1_move_1or2: // first and last byte (the same byte when R11 == 1)
  1500  	MOVB (SI), R14
  1501  	MOVB -1(SI)(R11*1), R15
  1502  	MOVB R14, (BX)
  1503  	MOVB R15, -1(BX)(R11*1)
  1504  	ADDQ R11, SI
  1505  	ADDQ R11, BX
  1506  	JMP  copy_1_end
  1507  
  1508  copy_1_move_3:
  1509  	MOVW (SI), R14
  1510  	MOVB 2(SI), R15
  1511  	MOVW R14, (BX)
  1512  	MOVB R15, 2(BX)
  1513  	ADDQ R11, SI
  1514  	ADDQ R11, BX
  1515  	JMP  copy_1_end
  1516  
  1517  copy_1_move_4through7: // first 4 and last 4 bytes; the two moves may overlap
  1518  	MOVL (SI), R14
  1519  	MOVL -4(SI)(R11*1), R15
  1520  	MOVL R14, (BX)
  1521  	MOVL R15, -4(BX)(R11*1)
  1522  	ADDQ R11, SI
  1523  	ADDQ R11, BX
  1524  	JMP  copy_1_end
  1525  
  1526  copy_1_move_8through16: // first 8 and last 8 bytes; the two moves may overlap
  1527  	MOVQ (SI), R14
  1528  	MOVQ -8(SI)(R11*1), R15
  1529  	MOVQ R14, (BX)
  1530  	MOVQ R15, -8(BX)(R11*1)
  1531  	ADDQ R11, SI
  1532  	ADDQ R11, BX
  1533  
  1534  copy_1_end:
  1535  	ADDQ R11, DI // outPosition += literal length
  1536  
  1537  	// Malformed input if (seq.mo > t+len(hist) || seq.mo > s.windowSize)
  1538  check_offset:
  1539  	LEAQ (DI)(R10*1), R11 // R11 = outPosition + len(hist)
  1540  	CMPQ R12, R11
  1541  	JG   error_match_off_too_big // signed compare: offset reaches before the start of history
  1542  	CMPQ R12, R8
  1543  	JG   error_match_off_too_big // signed compare against the limit loaded from 120(ctx)
  1544  
  1545  	// Copy match from history
  1546  	MOVQ R12, R11
  1547  	SUBQ DI, R11 // R11 = mo - outPosition = how far the match start lies inside history
  1548  	JLS  copy_match // unsigned mo <= outPosition: the match lies entirely in the current output
  1549  	MOVQ R9, R14
  1550  	SUBQ R11, R14 // R14 = source pointer, R11 bytes before the end of history
  1551  	CMPQ R13, R11
  1552  	JG   copy_all_from_history // match is longer than what history provides: take R11 bytes there, the rest from the output
  1553  	MOVQ R13, R11 // whole match comes from history; R11 becomes the chunk counter
  1554  	SUBQ $0x10, R11
  1555  	JB   copy_4_small // fewer than 16 bytes to copy
  1556  
  1557  copy_4_loop: // exact-length copy of R13 bytes (R13 >= 16) from history
  1558  	MOVUPS (R14), X0
  1559  	MOVUPS X0, (BX)
  1560  	ADDQ   $0x10, R14
  1561  	ADDQ   $0x10, BX
  1562  	SUBQ   $0x10, R11
  1563  	JAE    copy_4_loop // continue while at least 16 more bytes remain
  1564  	LEAQ   16(R14)(R11*1), R14 // R11 is in [-16,-1]; land both pointers on the exact end
  1565  	LEAQ   16(BX)(R11*1), BX
  1566  	MOVUPS -16(R14), X0 // final 16 bytes ending exactly at the end; may overlap the previous chunk
  1567  	MOVUPS X0, -16(BX)
  1568  	JMP    copy_4_end
  1569  
  1570  copy_4_small: // NOTE(review): no 1-or-2 byte case here; presumably the match length is always >= 3 on this path - confirm
  1571  	CMPQ R13, $0x03
  1572  	JE   copy_4_move_3
  1573  	CMPQ R13, $0x08
  1574  	JB   copy_4_move_4through7
  1575  	JMP  copy_4_move_8through16
  1576  
  1577  copy_4_move_3: // R11 and R12 are scratch below; the offset in R12 is not read again before main_loop reloads it
  1578  	MOVW (R14), R11
  1579  	MOVB 2(R14), R12
  1580  	MOVW R11, (BX)
  1581  	MOVB R12, 2(BX)
  1582  	ADDQ R13, R14
  1583  	ADDQ R13, BX
  1584  	JMP  copy_4_end
  1585  
  1586  copy_4_move_4through7: // first 4 and last 4 bytes; the two moves may overlap
  1587  	MOVL (R14), R11
  1588  	MOVL -4(R14)(R13*1), R12
  1589  	MOVL R11, (BX)
  1590  	MOVL R12, -4(BX)(R13*1)
  1591  	ADDQ R13, R14
  1592  	ADDQ R13, BX
  1593  	JMP  copy_4_end
  1594  
  1595  copy_4_move_8through16: // first 8 and last 8 bytes; the two moves may overlap
  1596  	MOVQ (R14), R11
  1597  	MOVQ -8(R14)(R13*1), R12
  1598  	MOVQ R11, (BX)
  1599  	MOVQ R12, -8(BX)(R13*1)
  1600  	ADDQ R13, R14
  1601  	ADDQ R13, BX
  1602  
  1603  copy_4_end: // match fully served from history; same per-sequence bookkeeping as handle_loop
  1604  	ADDQ R13, DI // outPosition += match length
  1605  	ADDQ $0x18, AX // next 24-byte sequence entry
  1606  	INCQ DX
  1607  	CMPQ DX, CX
  1608  	JB   main_loop
  1609  	JMP  loop_finished
  1610  
  1611  copy_all_from_history: // copy exactly R11 bytes (everything from R14 to the end of history)
  1612  	MOVQ R11, R15
  1613  	SUBQ $0x10, R15
  1614  	JB   copy_5_small // fewer than 16 bytes
  1615  
  1616  copy_5_loop:
  1617  	MOVUPS (R14), X0
  1618  	MOVUPS X0, (BX)
  1619  	ADDQ   $0x10, R14
  1620  	ADDQ   $0x10, BX
  1621  	SUBQ   $0x10, R15
  1622  	JAE    copy_5_loop
  1623  	LEAQ   16(R14)(R15*1), R14 // R15 is in [-16,-1]; land both pointers on the exact end
  1624  	LEAQ   16(BX)(R15*1), BX
  1625  	MOVUPS -16(R14), X0 // final 16 bytes ending exactly at the end; may overlap the previous chunk
  1626  	MOVUPS X0, -16(BX)
  1627  	JMP    copy_5_end
  1628  
  1629  copy_5_small:
  1630  	CMPQ R11, $0x03
  1631  	JE   copy_5_move_3
  1632  	JB   copy_5_move_1or2 // flags still come from the compare with 3
  1633  	CMPQ R11, $0x08
  1634  	JB   copy_5_move_4through7
  1635  	JMP  copy_5_move_8through16
  1636  
  1637  copy_5_move_1or2: // first and last byte (the same byte when R11 == 1); BP is used as scratch
  1638  	MOVB (R14), R15
  1639  	MOVB -1(R14)(R11*1), BP
  1640  	MOVB R15, (BX)
  1641  	MOVB BP, -1(BX)(R11*1)
  1642  	ADDQ R11, R14
  1643  	ADDQ R11, BX
  1644  	JMP  copy_5_end
  1645  
  1646  copy_5_move_3:
  1647  	MOVW (R14), R15
  1648  	MOVB 2(R14), BP
  1649  	MOVW R15, (BX)
  1650  	MOVB BP, 2(BX)
  1651  	ADDQ R11, R14
  1652  	ADDQ R11, BX
  1653  	JMP  copy_5_end
  1654  
  1655  copy_5_move_4through7: // first 4 and last 4 bytes; the two moves may overlap
  1656  	MOVL (R14), R15
  1657  	MOVL -4(R14)(R11*1), BP
  1658  	MOVL R15, (BX)
  1659  	MOVL BP, -4(BX)(R11*1)
  1660  	ADDQ R11, R14
  1661  	ADDQ R11, BX
  1662  	JMP  copy_5_end
  1663  
  1664  copy_5_move_8through16: // first 8 and last 8 bytes; the two moves may overlap
  1665  	MOVQ (R14), R15
  1666  	MOVQ -8(R14)(R11*1), BP
  1667  	MOVQ R15, (BX)
  1668  	MOVQ BP, -8(BX)(R11*1)
  1669  	ADDQ R11, R14
  1670  	ADDQ R11, BX
  1671  
  1672  copy_5_end:
  1673  	ADDQ R11, DI // outPosition += bytes taken from history
  1674  	SUBQ R11, R13 // R13 = match bytes still to copy; falls through to copy_match
  1675  
  1676  	// Copy match from the current buffer
  1677  copy_match:
  1678  	MOVQ BX, R11
  1679  	SUBQ R12, R11 // R11 = source pointer = write pointer - match offset
  1680  
  1681  	// ml <= mo
  1682  	CMPQ R13, R12
  1683  	JA   copy_overlapping_match // unsigned ml > mo: source and destination ranges overlap
  1684  
  1685  	// Copy non-overlapping match
  1686  	ADDQ R13, DI // outPosition += ml
  1687  	MOVQ R13, R12 // R12 = chunk counter; the offset is no longer needed
  1688  	SUBQ $0x10, R12
  1689  	JB   copy_2_small // fewer than 16 bytes
  1690  
  1691  copy_2_loop: // exact-length copy of R13 bytes (R13 >= 16) within the output buffer
  1692  	MOVUPS (R11), X0
  1693  	MOVUPS X0, (BX)
  1694  	ADDQ   $0x10, R11
  1695  	ADDQ   $0x10, BX
  1696  	SUBQ   $0x10, R12
  1697  	JAE    copy_2_loop
  1698  	LEAQ   16(R11)(R12*1), R11 // R12 is in [-16,-1]; land both pointers on the exact end
  1699  	LEAQ   16(BX)(R12*1), BX
  1700  	MOVUPS -16(R11), X0 // final 16 bytes ending exactly at the end; may overlap the previous chunk
  1701  	MOVUPS X0, -16(BX)
  1702  	JMP    copy_2_end
  1703  
  1704  copy_2_small: // R13 can be 1 or 2 here, e.g. when part of the match was already taken from history
  1705  	CMPQ R13, $0x03
  1706  	JE   copy_2_move_3
  1707  	JB   copy_2_move_1or2 // flags still come from the compare with 3
  1708  	CMPQ R13, $0x08
  1709  	JB   copy_2_move_4through7
  1710  	JMP  copy_2_move_8through16
  1711  
  1712  copy_2_move_1or2: // first and last byte; NOTE(review): with R13 == 0 this would touch one byte before each pointer - confirm it cannot occur
  1713  	MOVB (R11), R12
  1714  	MOVB -1(R11)(R13*1), R14
  1715  	MOVB R12, (BX)
  1716  	MOVB R14, -1(BX)(R13*1)
  1717  	ADDQ R13, R11
  1718  	ADDQ R13, BX
  1719  	JMP  copy_2_end
  1720  
  1721  copy_2_move_3:
  1722  	MOVW (R11), R12
  1723  	MOVB 2(R11), R14
  1724  	MOVW R12, (BX)
  1725  	MOVB R14, 2(BX)
  1726  	ADDQ R13, R11
  1727  	ADDQ R13, BX
  1728  	JMP  copy_2_end
  1729  
  1730  copy_2_move_4through7: // first 4 and last 4 bytes; the two moves may overlap
  1731  	MOVL (R11), R12
  1732  	MOVL -4(R11)(R13*1), R14
  1733  	MOVL R12, (BX)
  1734  	MOVL R14, -4(BX)(R13*1)
  1735  	ADDQ R13, R11
  1736  	ADDQ R13, BX
  1737  	JMP  copy_2_end
  1738  
  1739  copy_2_move_8through16: // first 8 and last 8 bytes; the two moves may overlap
  1740  	MOVQ (R11), R12
  1741  	MOVQ -8(R11)(R13*1), R14
  1742  	MOVQ R12, (BX)
  1743  	MOVQ R14, -8(BX)(R13*1)
  1744  	ADDQ R13, R11
  1745  	ADDQ R13, BX
  1746  
  1747  copy_2_end:
  1748  	JMP handle_loop // skip over the overlapping-copy code
  1749  
  1750  	// Copy overlapping match
  1751  copy_overlapping_match:
  1752  	ADDQ R13, DI // outPosition += ml
  1753  
  1754  copy_slow_3: // one byte at a time so bytes written earlier in this copy can be read back as source
  1755  	MOVB (R11), R12
  1756  	MOVB R12, (BX)
  1757  	INCQ R11
  1758  	INCQ BX
  1759  	DECQ R13
  1760  	JNZ  copy_slow_3
  1761  
  1762  handle_loop:
  1763  	ADDQ $0x18, AX // next 24-byte sequence entry
  1764  	INCQ DX
  1765  	CMPQ DX, CX
  1766  	JB   main_loop
  1767  
  1768  loop_finished:
  1769  	// Return value
  1770  	MOVB $0x01, ret+8(FP) // true
  1771  
  1772  	// Update the context
  1773  	MOVQ ctx+0(FP), AX // reload ctx; R10 was reused for len(hist)
  1774  	MOVQ DX, 24(AX) // seqIndex
  1775  	MOVQ DI, 104(AX) // outPosition
  1776  	SUBQ 80(AX), SI // SI = literals pointer - literals base = literal bytes consumed
  1777  	MOVQ SI, 112(AX)
  1778  	RET
  1779  
  1780  error_match_off_too_big: // DX still holds the index of the rejected sequence; its literals have already been copied
  1781  	// Return value
  1782  	MOVB $0x00, ret+8(FP) // false
  1783  
  1784  	// Update the context
  1785  	MOVQ ctx+0(FP), AX
  1786  	MOVQ DX, 24(AX)
  1787  	MOVQ DI, 104(AX)
  1788  	SUBQ 80(AX), SI
  1789  	MOVQ SI, 112(AX)
  1790  	RET
  1791  
  1792  empty_seqs: // no sequences: report success without writing anything back to ctx
  1793  	// Return value
  1794  	MOVB $0x01, ret+8(FP)
  1795  	RET
  1796  
  1797  // func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
  1798  // Requires: CMOV, SSE
  1799  TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32
  1800  	MOVQ    br+8(FP), AX
  1801  	MOVQ    32(AX), DX
  1802  	MOVBQZX 40(AX), BX
  1803  	MOVQ    24(AX), SI
  1804  	MOVQ    (AX), AX
  1805  	ADDQ    SI, AX
  1806  	MOVQ    AX, (SP)
  1807  	MOVQ    ctx+16(FP), AX
  1808  	MOVQ    72(AX), DI
  1809  	MOVQ    80(AX), R8
  1810  	MOVQ    88(AX), R9
  1811  	XORQ    CX, CX
  1812  	MOVQ    CX, 8(SP)
  1813  	MOVQ    CX, 16(SP)
  1814  	MOVQ    CX, 24(SP)
  1815  	MOVQ    112(AX), R10
  1816  	MOVQ    128(AX), CX
  1817  	MOVQ    CX, 32(SP)
  1818  	MOVQ    144(AX), R11
  1819  	MOVQ    136(AX), R12
  1820  	MOVQ    200(AX), CX
  1821  	MOVQ    CX, 56(SP)
  1822  	MOVQ    176(AX), CX
  1823  	MOVQ    CX, 48(SP)
  1824  	MOVQ    184(AX), AX
  1825  	MOVQ    AX, 40(SP)
  1826  	MOVQ    40(SP), AX
  1827  	ADDQ    AX, 48(SP)
  1828  
  1829  	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
  1830  	ADDQ R10, 32(SP)
  1831  
  1832  	// outBase += outPosition
  1833  	ADDQ R12, R10
  1834  
  1835  sequenceDecs_decodeSync_amd64_main_loop:
  1836  	MOVQ (SP), R13
  1837  
  1838  	// Fill bitreader to have enough for the offset and match length.
  1839  	CMPQ SI, $0x08
  1840  	JL   sequenceDecs_decodeSync_amd64_fill_byte_by_byte
  1841  	MOVQ BX, AX
  1842  	SHRQ $0x03, AX
  1843  	SUBQ AX, R13
  1844  	MOVQ (R13), DX
  1845  	SUBQ AX, SI
  1846  	ANDQ $0x07, BX
  1847  	JMP  sequenceDecs_decodeSync_amd64_fill_end
  1848  
  1849  sequenceDecs_decodeSync_amd64_fill_byte_by_byte:
  1850  	CMPQ    SI, $0x00
  1851  	JLE     sequenceDecs_decodeSync_amd64_fill_check_overread
  1852  	CMPQ    BX, $0x07
  1853  	JLE     sequenceDecs_decodeSync_amd64_fill_end
  1854  	SHLQ    $0x08, DX
  1855  	SUBQ    $0x01, R13
  1856  	SUBQ    $0x01, SI
  1857  	SUBQ    $0x08, BX
  1858  	MOVBQZX (R13), AX
  1859  	ORQ     AX, DX
  1860  	JMP     sequenceDecs_decodeSync_amd64_fill_byte_by_byte
  1861  
  1862  sequenceDecs_decodeSync_amd64_fill_check_overread:
  1863  	CMPQ BX, $0x40
  1864  	JA   error_overread
  1865  
  1866  sequenceDecs_decodeSync_amd64_fill_end:
  1867  	// Update offset
  1868  	MOVQ  R9, AX
  1869  	MOVQ  BX, CX
  1870  	MOVQ  DX, R14
  1871  	SHLQ  CL, R14
  1872  	MOVB  AH, CL
  1873  	SHRQ  $0x20, AX
  1874  	TESTQ CX, CX
  1875  	JZ    sequenceDecs_decodeSync_amd64_of_update_zero
  1876  	ADDQ  CX, BX
  1877  	CMPQ  BX, $0x40
  1878  	JA    sequenceDecs_decodeSync_amd64_of_update_zero
  1879  	CMPQ  CX, $0x40
  1880  	JAE   sequenceDecs_decodeSync_amd64_of_update_zero
  1881  	NEGQ  CX
  1882  	SHRQ  CL, R14
  1883  	ADDQ  R14, AX
  1884  
  1885  sequenceDecs_decodeSync_amd64_of_update_zero:
  1886  	MOVQ AX, 8(SP)
  1887  
  1888  	// Update match length
  1889  	MOVQ  R8, AX
  1890  	MOVQ  BX, CX
  1891  	MOVQ  DX, R14
  1892  	SHLQ  CL, R14
  1893  	MOVB  AH, CL
  1894  	SHRQ  $0x20, AX
  1895  	TESTQ CX, CX
  1896  	JZ    sequenceDecs_decodeSync_amd64_ml_update_zero
  1897  	ADDQ  CX, BX
  1898  	CMPQ  BX, $0x40
  1899  	JA    sequenceDecs_decodeSync_amd64_ml_update_zero
  1900  	CMPQ  CX, $0x40
  1901  	JAE   sequenceDecs_decodeSync_amd64_ml_update_zero
  1902  	NEGQ  CX
  1903  	SHRQ  CL, R14
  1904  	ADDQ  R14, AX
  1905  
  1906  sequenceDecs_decodeSync_amd64_ml_update_zero:
  1907  	MOVQ AX, 16(SP)
  1908  
  1909  	// Fill bitreader to have enough for the remaining
  1910  	CMPQ SI, $0x08
  1911  	JL   sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
  1912  	MOVQ BX, AX
  1913  	SHRQ $0x03, AX
  1914  	SUBQ AX, R13
  1915  	MOVQ (R13), DX
  1916  	SUBQ AX, SI
  1917  	ANDQ $0x07, BX
  1918  	JMP  sequenceDecs_decodeSync_amd64_fill_2_end
  1919  
  1920  sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte:
  1921  	CMPQ    SI, $0x00
  1922  	JLE     sequenceDecs_decodeSync_amd64_fill_2_check_overread
  1923  	CMPQ    BX, $0x07
  1924  	JLE     sequenceDecs_decodeSync_amd64_fill_2_end
  1925  	SHLQ    $0x08, DX
  1926  	SUBQ    $0x01, R13
  1927  	SUBQ    $0x01, SI
  1928  	SUBQ    $0x08, BX
  1929  	MOVBQZX (R13), AX
  1930  	ORQ     AX, DX
  1931  	JMP     sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
  1932  
  1933  sequenceDecs_decodeSync_amd64_fill_2_check_overread:
  1934  	CMPQ BX, $0x40
  1935  	JA   error_overread
  1936  
  1937  sequenceDecs_decodeSync_amd64_fill_2_end:
  1938  	// Update literal length
  1939  	MOVQ  DI, AX
  1940  	MOVQ  BX, CX
  1941  	MOVQ  DX, R14
  1942  	SHLQ  CL, R14
  1943  	MOVB  AH, CL
  1944  	SHRQ  $0x20, AX
  1945  	TESTQ CX, CX
  1946  	JZ    sequenceDecs_decodeSync_amd64_ll_update_zero
  1947  	ADDQ  CX, BX
  1948  	CMPQ  BX, $0x40
  1949  	JA    sequenceDecs_decodeSync_amd64_ll_update_zero
  1950  	CMPQ  CX, $0x40
  1951  	JAE   sequenceDecs_decodeSync_amd64_ll_update_zero
  1952  	NEGQ  CX
  1953  	SHRQ  CL, R14
  1954  	ADDQ  R14, AX
  1955  
  1956  sequenceDecs_decodeSync_amd64_ll_update_zero:
  1957  	MOVQ AX, 24(SP)
  1958  
  1959  	// Fill bitreader for state updates
  1960  	MOVQ    R13, (SP)
  1961  	MOVQ    R9, AX
  1962  	SHRQ    $0x08, AX
  1963  	MOVBQZX AL, AX
  1964  	MOVQ    ctx+16(FP), CX
  1965  	CMPQ    96(CX), $0x00
  1966  	JZ      sequenceDecs_decodeSync_amd64_skip_update
  1967  
  1968  	// Update Literal Length State
  1969  	MOVBQZX DI, R13
  1970  	SHRQ    $0x10, DI
  1971  	MOVWQZX DI, DI
  1972  	LEAQ    (BX)(R13*1), CX
  1973  	MOVQ    DX, R14
  1974  	MOVQ    CX, BX
  1975  	ROLQ    CL, R14
  1976  	MOVL    $0x00000001, R15
  1977  	MOVB    R13, CL
  1978  	SHLL    CL, R15
  1979  	DECL    R15
  1980  	ANDQ    R15, R14
  1981  	ADDQ    R14, DI
  1982  
  1983  	// Load ctx.llTable
  1984  	MOVQ ctx+16(FP), CX
  1985  	MOVQ (CX), CX
  1986  	MOVQ (CX)(DI*8), DI
  1987  
  1988  	// Update Match Length State
  1989  	MOVBQZX R8, R13
  1990  	SHRQ    $0x10, R8
  1991  	MOVWQZX R8, R8
  1992  	LEAQ    (BX)(R13*1), CX
  1993  	MOVQ    DX, R14
  1994  	MOVQ    CX, BX
  1995  	ROLQ    CL, R14
  1996  	MOVL    $0x00000001, R15
  1997  	MOVB    R13, CL
  1998  	SHLL    CL, R15
  1999  	DECL    R15
  2000  	ANDQ    R15, R14
  2001  	ADDQ    R14, R8
  2002  
  2003  	// Load ctx.mlTable
  2004  	MOVQ ctx+16(FP), CX
  2005  	MOVQ 24(CX), CX
  2006  	MOVQ (CX)(R8*8), R8
  2007  
  2008  	// Update Offset State
  2009  	MOVBQZX R9, R13
  2010  	SHRQ    $0x10, R9
  2011  	MOVWQZX R9, R9
  2012  	LEAQ    (BX)(R13*1), CX
  2013  	MOVQ    DX, R14
  2014  	MOVQ    CX, BX
  2015  	ROLQ    CL, R14
  2016  	MOVL    $0x00000001, R15
  2017  	MOVB    R13, CL
  2018  	SHLL    CL, R15
  2019  	DECL    R15
  2020  	ANDQ    R15, R14
  2021  	ADDQ    R14, R9
  2022  
  2023  	// Load ctx.ofTable
  2024  	MOVQ ctx+16(FP), CX
  2025  	MOVQ 48(CX), CX
  2026  	MOVQ (CX)(R9*8), R9
  2027  
  2028  sequenceDecs_decodeSync_amd64_skip_update:
  2029  	// Adjust offset
  2030  	MOVQ   s+0(FP), CX
  2031  	MOVQ   8(SP), R13
  2032  	CMPQ   AX, $0x01
  2033  	JBE    sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0
  2034  	MOVUPS 144(CX), X0
  2035  	MOVQ   R13, 144(CX)
  2036  	MOVUPS X0, 152(CX)
  2037  	JMP    sequenceDecs_decodeSync_amd64_after_adjust
  2038  
  2039  sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0:
  2040  	CMPQ 24(SP), $0x00000000
  2041  	JNE  sequenceDecs_decodeSync_amd64_adjust_offset_maybezero
  2042  	INCQ R13
  2043  	JMP  sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
  2044  
  2045  sequenceDecs_decodeSync_amd64_adjust_offset_maybezero:
  2046  	TESTQ R13, R13
  2047  	JNZ   sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
  2048  	MOVQ  144(CX), R13
  2049  	JMP   sequenceDecs_decodeSync_amd64_after_adjust
  2050  
  2051  sequenceDecs_decodeSync_amd64_adjust_offset_nonzero:
  2052  	MOVQ    R13, AX
  2053  	XORQ    R14, R14
  2054  	MOVQ    $-1, R15
  2055  	CMPQ    R13, $0x03
  2056  	CMOVQEQ R14, AX
  2057  	CMOVQEQ R15, R14
  2058  	ADDQ    144(CX)(AX*8), R14
  2059  	JNZ     sequenceDecs_decodeSync_amd64_adjust_temp_valid
  2060  	MOVQ    $0x00000001, R14
  2061  
  2062  sequenceDecs_decodeSync_amd64_adjust_temp_valid:
  2063  	CMPQ R13, $0x01
  2064  	JZ   sequenceDecs_decodeSync_amd64_adjust_skip
  2065  	MOVQ 152(CX), AX
  2066  	MOVQ AX, 160(CX)
  2067  
  2068  sequenceDecs_decodeSync_amd64_adjust_skip:
  2069  	MOVQ 144(CX), AX
  2070  	MOVQ AX, 152(CX)
  2071  	MOVQ R14, 144(CX)
  2072  	MOVQ R14, R13
  2073  
  2074  sequenceDecs_decodeSync_amd64_after_adjust:
  2075  	MOVQ R13, 8(SP)
  2076  
  2077  	// Check values
  2078  	MOVQ  16(SP), AX
  2079  	MOVQ  24(SP), CX
  2080  	LEAQ  (AX)(CX*1), R14
  2081  	MOVQ  s+0(FP), R15
  2082  	ADDQ  R14, 256(R15)
  2083  	MOVQ  ctx+16(FP), R14
  2084  	SUBQ  CX, 104(R14)
  2085  	JS    error_not_enough_literals
  2086  	CMPQ  AX, $0x00020002
  2087  	JA    sequenceDecs_decodeSync_amd64_error_match_len_too_big
  2088  	TESTQ R13, R13
  2089  	JNZ   sequenceDecs_decodeSync_amd64_match_len_ofs_ok
  2090  	TESTQ AX, AX
  2091  	JNZ   sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch
  2092  
  2093  sequenceDecs_decodeSync_amd64_match_len_ofs_ok:
  2094  	MOVQ 24(SP), AX
  2095  	MOVQ 8(SP), CX
  2096  	MOVQ 16(SP), R13
  2097  
  2098  	// Check if we have enough space in s.out
  2099  	LEAQ (AX)(R13*1), R14
  2100  	ADDQ R10, R14
  2101  	CMPQ R14, 32(SP)
  2102  	JA   error_not_enough_space
  2103  
  2104  	// Copy literals
  2105  	TESTQ AX, AX
  2106  	JZ    check_offset
  2107  	XORQ  R14, R14
  2108  
  2109  copy_1:
  2110  	MOVUPS (R11)(R14*1), X0
  2111  	MOVUPS X0, (R10)(R14*1)
  2112  	ADDQ   $0x10, R14
  2113  	CMPQ   R14, AX
  2114  	JB     copy_1
  2115  	ADDQ   AX, R11
  2116  	ADDQ   AX, R10
  2117  	ADDQ   AX, R12
  2118  
  2119  	// Malformed input if (seq.mo > t+len(hist) || seq.mo > s.windowSize)
  2120  check_offset:
  2121  	MOVQ R12, AX
  2122  	ADDQ 40(SP), AX
  2123  	CMPQ CX, AX
  2124  	JG   error_match_off_too_big
  2125  	CMPQ CX, 56(SP)
  2126  	JG   error_match_off_too_big
  2127  
  2128  	// Copy match from history
  2129  	MOVQ CX, AX
  2130  	SUBQ R12, AX
  2131  	JLS  copy_match
  2132  	MOVQ 48(SP), R14
  2133  	SUBQ AX, R14
  2134  	CMPQ R13, AX
  2135  	JG   copy_all_from_history
  2136  	MOVQ R13, AX
  2137  	SUBQ $0x10, AX
  2138  	JB   copy_4_small
  2139  
  2140  copy_4_loop:
  2141  	MOVUPS (R14), X0
  2142  	MOVUPS X0, (R10)
  2143  	ADDQ   $0x10, R14
  2144  	ADDQ   $0x10, R10
  2145  	SUBQ   $0x10, AX
  2146  	JAE    copy_4_loop
  2147  	LEAQ   16(R14)(AX*1), R14
  2148  	LEAQ   16(R10)(AX*1), R10
  2149  	MOVUPS -16(R14), X0
  2150  	MOVUPS X0, -16(R10)
  2151  	JMP    copy_4_end
  2152  
  2153  copy_4_small:
  2154  	CMPQ R13, $0x03
  2155  	JE   copy_4_move_3
  2156  	CMPQ R13, $0x08
  2157  	JB   copy_4_move_4through7
  2158  	JMP  copy_4_move_8through16
  2159  
  2160  copy_4_move_3:
  2161  	MOVW (R14), AX
  2162  	MOVB 2(R14), CL
  2163  	MOVW AX, (R10)
  2164  	MOVB CL, 2(R10)
  2165  	ADDQ R13, R14
  2166  	ADDQ R13, R10
  2167  	JMP  copy_4_end
  2168  
  2169  copy_4_move_4through7:
  2170  	MOVL (R14), AX
  2171  	MOVL -4(R14)(R13*1), CX
  2172  	MOVL AX, (R10)
  2173  	MOVL CX, -4(R10)(R13*1)
  2174  	ADDQ R13, R14
  2175  	ADDQ R13, R10
  2176  	JMP  copy_4_end
  2177  
  2178  copy_4_move_8through16:
  2179  	MOVQ (R14), AX
  2180  	MOVQ -8(R14)(R13*1), CX
  2181  	MOVQ AX, (R10)
  2182  	MOVQ CX, -8(R10)(R13*1)
  2183  	ADDQ R13, R14
  2184  	ADDQ R13, R10
  2185  
  2186  copy_4_end:
  2187  	ADDQ R13, R12
  2188  	JMP  handle_loop
  2189  	JMP loop_finished
  2190  
  2191  copy_all_from_history:
  2192  	MOVQ AX, R15
  2193  	SUBQ $0x10, R15
  2194  	JB   copy_5_small
  2195  
  2196  copy_5_loop:
  2197  	MOVUPS (R14), X0
  2198  	MOVUPS X0, (R10)
  2199  	ADDQ   $0x10, R14
  2200  	ADDQ   $0x10, R10
  2201  	SUBQ   $0x10, R15
  2202  	JAE    copy_5_loop
  2203  	LEAQ   16(R14)(R15*1), R14
  2204  	LEAQ   16(R10)(R15*1), R10
  2205  	MOVUPS -16(R14), X0
  2206  	MOVUPS X0, -16(R10)
  2207  	JMP    copy_5_end
  2208  
  2209  copy_5_small:
  2210  	CMPQ AX, $0x03
  2211  	JE   copy_5_move_3
  2212  	JB   copy_5_move_1or2
  2213  	CMPQ AX, $0x08
  2214  	JB   copy_5_move_4through7
  2215  	JMP  copy_5_move_8through16
  2216  
  2217  copy_5_move_1or2:
  2218  	MOVB (R14), R15
  2219  	MOVB -1(R14)(AX*1), BP
  2220  	MOVB R15, (R10)
  2221  	MOVB BP, -1(R10)(AX*1)
  2222  	ADDQ AX, R14
  2223  	ADDQ AX, R10
  2224  	JMP  copy_5_end
  2225  
  2226  copy_5_move_3:
  2227  	MOVW (R14), R15
  2228  	MOVB 2(R14), BP
  2229  	MOVW R15, (R10)
  2230  	MOVB BP, 2(R10)
  2231  	ADDQ AX, R14
  2232  	ADDQ AX, R10
  2233  	JMP  copy_5_end
  2234  
  2235  copy_5_move_4through7:
  2236  	MOVL (R14), R15
  2237  	MOVL -4(R14)(AX*1), BP
  2238  	MOVL R15, (R10)
  2239  	MOVL BP, -4(R10)(AX*1)
  2240  	ADDQ AX, R14
  2241  	ADDQ AX, R10
  2242  	JMP  copy_5_end
  2243  
  2244  copy_5_move_8through16:
  2245  	MOVQ (R14), R15
  2246  	MOVQ -8(R14)(AX*1), BP
  2247  	MOVQ R15, (R10)
  2248  	MOVQ BP, -8(R10)(AX*1)
  2249  	ADDQ AX, R14
  2250  	ADDQ AX, R10
  2251  
  2252  copy_5_end:
  2253  	ADDQ AX, R12
  2254  	SUBQ AX, R13
  2255  
  2256  	// Copy match from the current buffer
  2257  copy_match:
  2258  	MOVQ R10, AX
  2259  	SUBQ CX, AX
  2260  
  2261  	// ml <= mo
  2262  	CMPQ R13, CX
  2263  	JA   copy_overlapping_match
  2264  
  2265  	// Copy non-overlapping match
  2266  	ADDQ R13, R12
  2267  	MOVQ R10, CX
  2268  	ADDQ R13, R10
  2269  
  2270  copy_2:
  2271  	MOVUPS (AX), X0
  2272  	MOVUPS X0, (CX)
  2273  	ADDQ   $0x10, AX
  2274  	ADDQ   $0x10, CX
  2275  	SUBQ   $0x10, R13
  2276  	JHI    copy_2
  2277  	JMP    handle_loop
  2278  
  2279  	// Copy overlapping match
  2280  copy_overlapping_match:
  2281  	ADDQ R13, R12
  2282  
  2283  copy_slow_3:
  2284  	MOVB (AX), CL
  2285  	MOVB CL, (R10)
  2286  	INCQ AX
  2287  	INCQ R10
  2288  	DECQ R13
  2289  	JNZ  copy_slow_3
  2290  
  2291  handle_loop:
  2292  	MOVQ ctx+16(FP), AX
  2293  	DECQ 96(AX)
  2294  	JNS  sequenceDecs_decodeSync_amd64_main_loop
  2295  
  2296  loop_finished:
  2297  	MOVQ br+8(FP), AX
  2298  	MOVQ DX, 32(AX)
  2299  	MOVB BL, 40(AX)
  2300  	MOVQ SI, 24(AX)
  2301  
  2302  	// Update the context
  2303  	MOVQ ctx+16(FP), AX
  2304  	MOVQ R12, 136(AX)
  2305  	MOVQ 144(AX), CX
  2306  	SUBQ CX, R11
  2307  	MOVQ R11, 168(AX)
  2308  
  2309  	// Return success
  2310  	MOVQ $0x00000000, ret+24(FP)
  2311  	RET
  2312  
  2313  	// Return with match length error
  2314  sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch:
  2315  	MOVQ 16(SP), AX
  2316  	MOVQ ctx+16(FP), CX
  2317  	MOVQ AX, 216(CX)
  2318  	MOVQ $0x00000001, ret+24(FP)
  2319  	RET
  2320  
  2321  	// Return with match too long error
  2322  sequenceDecs_decodeSync_amd64_error_match_len_too_big:
  2323  	MOVQ ctx+16(FP), AX
  2324  	MOVQ 16(SP), CX
  2325  	MOVQ CX, 216(AX)
  2326  	MOVQ $0x00000002, ret+24(FP)
  2327  	RET
  2328  
  2329  	// Return with match offset too long error
  2330  error_match_off_too_big:
  2331  	MOVQ ctx+16(FP), AX
  2332  	MOVQ 8(SP), CX
  2333  	MOVQ CX, 224(AX)
  2334  	MOVQ R12, 136(AX)
  2335  	MOVQ $0x00000003, ret+24(FP)
  2336  	RET
  2337  
  2338  	// Return with not enough literals error
  2339  error_not_enough_literals:
  2340  	MOVQ ctx+16(FP), AX
  2341  	MOVQ 24(SP), CX
  2342  	MOVQ CX, 208(AX)
  2343  	MOVQ $0x00000004, ret+24(FP)
  2344  	RET
  2345  
  2346  	// Return with overread error
  2347  error_overread:
  2348  	MOVQ $0x00000006, ret+24(FP)
  2349  	RET
  2350  
  2351  	// Return with not enough output space error
  2352  error_not_enough_space:
  2353  	MOVQ ctx+16(FP), AX
  2354  	MOVQ 24(SP), CX
  2355  	MOVQ CX, 208(AX)
  2356  	MOVQ 16(SP), CX
  2357  	MOVQ CX, 216(AX)
  2358  	MOVQ R12, 136(AX)
  2359  	MOVQ $0x00000005, ret+24(FP)
  2360  	RET
  2361  
  2362  // func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
  2363  // Requires: BMI, BMI2, CMOV, SSE
        //
        // Decodes sequences from the bit reader and executes each one immediately:
        // the literals are copied to the output, then the match is copied from the
        // history buffer and/or from output that was already written. The loop runs
        // until the counter at 96(ctx) goes negative or an error is detected.
        //
        // Return value (ret+24(FP)):
        //   0 success
        //   1 match length is non-zero while the match offset is zero
        //   2 match length too big
        //   3 match offset too big
        //   4 not enough literals
        //   5 not enough output space
        //   6 bit reader overread
        //
        // Register use inside the main loop:
        //   AX          bit container (loaded from / stored back to 32(br))
        //   DX          number of bits already consumed from AX (byte at 40(br))
        //   BX          offset of the input read pointer from the input base (24(br));
        //               it counts down as input bytes are consumed
        //   SI, DI, R8  literal-length, match-length and offset decoder states
        //               (loaded from 72(ctx), 80(ctx), 88(ctx))
        //   R9          output write pointer
        //   R10         literals read pointer
        //   R11         output position (loaded from / stored back to 136(ctx))
        //   CX, R12-R15, BP, X0  scratch
        //
        // Stack slots:
        //    0(SP) input read pointer        32(SP) pointer past the output capacity
        //    8(SP) match offset              40(SP) history length
        //   16(SP) match length              48(SP) pointer past the end of history
        //   24(SP) literal length            56(SP) window size
        //
        // Layout of a decoder state word, as used by the code below:
        //   bits  0-7   number of bits to read for the next-state update
        //   bits  8-15  number of extra bits to read for the value
        //   bits 16-31  base added to the update bits to form the next table index
        //   bits 32-63  baseline added to the extra bits to form the value
  2364  TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32
        	// Load the bit reader: container, consumed-bit count, offset and base.
  2365  	MOVQ    br+8(FP), CX
  2366  	MOVQ    32(CX), AX
  2367  	MOVBQZX 40(CX), DX
  2368  	MOVQ    24(CX), BX
  2369  	MOVQ    (CX), CX
  2370  	ADDQ    BX, CX
  2371  	MOVQ    CX, (SP)
        	// Load the three decoder states and clear the per-sequence result slots.
  2372  	MOVQ    ctx+16(FP), CX
  2373  	MOVQ    72(CX), SI
  2374  	MOVQ    80(CX), DI
  2375  	MOVQ    88(CX), R8
  2376  	XORQ    R9, R9
  2377  	MOVQ    R9, 8(SP)
  2378  	MOVQ    R9, 16(SP)
  2379  	MOVQ    R9, 24(SP)
        	// Output base, output capacity, literals pointer, output position,
        	// window size, history base and history length.
  2380  	MOVQ    112(CX), R9
  2381  	MOVQ    128(CX), R10
  2382  	MOVQ    R10, 32(SP)
  2383  	MOVQ    144(CX), R10
  2384  	MOVQ    136(CX), R11
  2385  	MOVQ    200(CX), R12
  2386  	MOVQ    R12, 56(SP)
  2387  	MOVQ    176(CX), R12
  2388  	MOVQ    R12, 48(SP)
  2389  	MOVQ    184(CX), CX
  2390  	MOVQ    CX, 40(SP)
        	// 48(SP) = history base + history length, i.e. a pointer past the history.
  2391  	MOVQ    40(SP), CX
  2392  	ADDQ    CX, 48(SP)
  2393  
  2394  	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
  2395  	ADDQ R9, 32(SP)
  2396  
  2397  	// outBase += outPosition
  2398  	ADDQ R11, R9
  2399  
  2400  sequenceDecs_decodeSync_bmi2_main_loop:
  2401  	MOVQ (SP), R12
  2402  
  2403  	// Fill bitreader to have enough for the offset and match length.
        	// Fast path (at least 8 bytes of offset left): step the read pointer back
        	// by the number of whole bytes consumed, reload 8 bytes, and keep only the
        	// sub-byte remainder of the consumed-bit count.
  2404  	CMPQ BX, $0x08
  2405  	JL   sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
  2406  	MOVQ DX, CX
  2407  	SHRQ $0x03, CX
  2408  	SUBQ CX, R12
  2409  	MOVQ (R12), AX
  2410  	SUBQ CX, BX
  2411  	ANDQ $0x07, DX
  2412  	JMP  sequenceDecs_decodeSync_bmi2_fill_end
  2413  
        // Slow path: shift one input byte at a time into the container while input
        // remains and more than 7 bits have been consumed.
  2414  sequenceDecs_decodeSync_bmi2_fill_byte_by_byte:
  2415  	CMPQ    BX, $0x00
  2416  	JLE     sequenceDecs_decodeSync_bmi2_fill_check_overread
  2417  	CMPQ    DX, $0x07
  2418  	JLE     sequenceDecs_decodeSync_bmi2_fill_end
  2419  	SHLQ    $0x08, AX
  2420  	SUBQ    $0x01, R12
  2421  	SUBQ    $0x01, BX
  2422  	SUBQ    $0x08, DX
  2423  	MOVBQZX (R12), CX
  2424  	ORQ     CX, AX
  2425  	JMP     sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
  2426  
        // Input exhausted: more than 64 consumed bits means the stream was overread.
  2427  sequenceDecs_decodeSync_bmi2_fill_check_overread:
  2428  	CMPQ DX, $0x40
  2429  	JA   error_overread
  2430  
  2431  sequenceDecs_decodeSync_bmi2_fill_end:
  2432  	// Update offset
        	// BEXTR control 0x0808 selects 8 bits starting at bit 8: the number of
        	// extra bits (n) for this state. Rotating the container left by
        	// consumed+n and keeping the low n bits (BZHI) yields the next n bits of
        	// the stream; the state's upper 32 bits are added as the baseline.
  2433  	MOVQ   $0x00000808, CX
  2434  	BEXTRQ CX, R8, R13
  2435  	MOVQ   AX, R14
  2436  	LEAQ   (DX)(R13*1), CX
  2437  	ROLQ   CL, R14
  2438  	BZHIQ  R13, R14, R14
  2439  	MOVQ   CX, DX
  2440  	MOVQ   R8, CX
  2441  	SHRQ   $0x20, CX
  2442  	ADDQ   R14, CX
  2443  	MOVQ   CX, 8(SP)
  2444  
  2445  	// Update match length
        	// Same extraction as above, using the match-length state in DI.
  2446  	MOVQ   $0x00000808, CX
  2447  	BEXTRQ CX, DI, R13
  2448  	MOVQ   AX, R14
  2449  	LEAQ   (DX)(R13*1), CX
  2450  	ROLQ   CL, R14
  2451  	BZHIQ  R13, R14, R14
  2452  	MOVQ   CX, DX
  2453  	MOVQ   DI, CX
  2454  	SHRQ   $0x20, CX
  2455  	ADDQ   R14, CX
  2456  	MOVQ   CX, 16(SP)
  2457  
  2458  	// Fill bitreader to have enough for the remaining
        	// Same refill logic as at the top of the loop.
  2459  	CMPQ BX, $0x08
  2460  	JL   sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
  2461  	MOVQ DX, CX
  2462  	SHRQ $0x03, CX
  2463  	SUBQ CX, R12
  2464  	MOVQ (R12), AX
  2465  	SUBQ CX, BX
  2466  	ANDQ $0x07, DX
  2467  	JMP  sequenceDecs_decodeSync_bmi2_fill_2_end
  2468  
  2469  sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte:
  2470  	CMPQ    BX, $0x00
  2471  	JLE     sequenceDecs_decodeSync_bmi2_fill_2_check_overread
  2472  	CMPQ    DX, $0x07
  2473  	JLE     sequenceDecs_decodeSync_bmi2_fill_2_end
  2474  	SHLQ    $0x08, AX
  2475  	SUBQ    $0x01, R12
  2476  	SUBQ    $0x01, BX
  2477  	SUBQ    $0x08, DX
  2478  	MOVBQZX (R12), CX
  2479  	ORQ     CX, AX
  2480  	JMP     sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
  2481  
  2482  sequenceDecs_decodeSync_bmi2_fill_2_check_overread:
  2483  	CMPQ DX, $0x40
  2484  	JA   error_overread
  2485  
  2486  sequenceDecs_decodeSync_bmi2_fill_2_end:
  2487  	// Update literal length
        	// Same extraction as for the offset, using the literal-length state in SI.
  2488  	MOVQ   $0x00000808, CX
  2489  	BEXTRQ CX, SI, R13
  2490  	MOVQ   AX, R14
  2491  	LEAQ   (DX)(R13*1), CX
  2492  	ROLQ   CL, R14
  2493  	BZHIQ  R13, R14, R14
  2494  	MOVQ   CX, DX
  2495  	MOVQ   SI, CX
  2496  	SHRQ   $0x20, CX
  2497  	ADDQ   R14, CX
  2498  	MOVQ   CX, 24(SP)
  2499  
  2500  	// Fill bitreader for state updates
        	// Save the read pointer. R12 keeps the offset state's extra-bit count
        	// (bits 8-15 of R8) because R8 is overwritten by the state update below
        	// and the value is needed again in the "Adjust offset" step.
  2501  	MOVQ    R12, (SP)
  2502  	MOVQ    $0x00000808, CX
  2503  	BEXTRQ  CX, R8, R12
        	// The state update is skipped when the counter at 96(ctx) is zero,
        	// i.e. on the final iteration of the loop.
  2504  	MOVQ    ctx+16(FP), CX
  2505  	CMPQ    96(CX), $0x00
  2506  	JZ      sequenceDecs_decodeSync_bmi2_skip_update
        	// R13 = low byte of (SI + DI + R8): the total number of update bits for
        	// the three states. Those bits are read in one go into R14.
  2507  	LEAQ    (SI)(DI*1), R13
  2508  	ADDQ    R8, R13
  2509  	MOVBQZX R13, R13
  2510  	LEAQ    (DX)(R13*1), CX
  2511  	MOVQ    AX, R14
  2512  	MOVQ    CX, DX
  2513  	ROLQ    CL, R14
  2514  	BZHIQ   R13, R14, R14
  2515  
  2516  	// Update Offset State
        	// Take the low update bits for this state from R14 (BZHI indexes by the
        	// state's low byte), shift them out of R14, and add them to bits 16-31
        	// of the state (BEXTR control 0x1010) to form the new table index.
  2517  	BZHIQ  R8, R14, CX
  2518  	SHRXQ  R8, R14, R14
  2519  	MOVQ   $0x00001010, R13
  2520  	BEXTRQ R13, R8, R8
  2521  	ADDQ   CX, R8
  2522  
  2523  	// Load ctx.ofTable
  2524  	MOVQ ctx+16(FP), CX
  2525  	MOVQ 48(CX), CX
  2526  	MOVQ (CX)(R8*8), R8
  2527  
  2528  	// Update Match Length State
  2529  	BZHIQ  DI, R14, CX
  2530  	SHRXQ  DI, R14, R14
  2531  	MOVQ   $0x00001010, R13
  2532  	BEXTRQ R13, DI, DI
  2533  	ADDQ   CX, DI
  2534  
  2535  	// Load ctx.mlTable
  2536  	MOVQ ctx+16(FP), CX
  2537  	MOVQ 24(CX), CX
  2538  	MOVQ (CX)(DI*8), DI
  2539  
  2540  	// Update Literal Length State
        	// Last consumer of R14, so no shift of the remaining bits is needed.
  2541  	BZHIQ  SI, R14, CX
  2542  	MOVQ   $0x00001010, R13
  2543  	BEXTRQ R13, SI, SI
  2544  	ADDQ   CX, SI
  2545  
  2546  	// Load ctx.llTable
  2547  	MOVQ ctx+16(FP), CX
  2548  	MOVQ (CX), CX
  2549  	MOVQ (CX)(SI*8), SI
  2550  
  2551  sequenceDecs_decodeSync_bmi2_skip_update:
  2552  	// Adjust offset
        	// The three quadwords at 144(s), 152(s) and 160(s) hold the previous
        	// offsets, most recent first.
        	// NOTE(review): presumably s.prevOffset[0..2] - confirm against the Go struct.
        	// With more than one extra offset bit the decoded offset is used as is and
        	// pushed to the front: X0 carries the old first two entries into slots 1-2.
  2553  	MOVQ   s+0(FP), CX
  2554  	MOVQ   8(SP), R13
  2555  	CMPQ   R12, $0x01
  2556  	JBE    sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0
  2557  	MOVUPS 144(CX), X0
  2558  	MOVQ   R13, 144(CX)
  2559  	MOVUPS X0, 152(CX)
  2560  	JMP    sequenceDecs_decodeSync_bmi2_after_adjust
  2561  
        // Zero or one extra offset bit: the decoded value selects a previous offset.
        // A literal length of zero bumps the selector by one.
  2562  sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0:
  2563  	CMPQ 24(SP), $0x00000000
  2564  	JNE  sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero
  2565  	INCQ R13
  2566  	JMP  sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
  2567  
        // Selector 0 with literals present: reuse the most recent offset unchanged.
  2568  sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero:
  2569  	TESTQ R13, R13
  2570  	JNZ   sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
  2571  	MOVQ  144(CX), R13
  2572  	JMP   sequenceDecs_decodeSync_bmi2_after_adjust
  2573  
        // Selector 1 or 2 picks that previous-offset slot; selector 3 picks slot 0
        // minus one. A resulting offset of zero is replaced by 1.
  2574  sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero:
  2575  	MOVQ    R13, R12
  2576  	XORQ    R14, R14
  2577  	MOVQ    $-1, R15
  2578  	CMPQ    R13, $0x03
  2579  	CMOVQEQ R14, R12
  2580  	CMOVQEQ R15, R14
  2581  	ADDQ    144(CX)(R12*8), R14
  2582  	JNZ     sequenceDecs_decodeSync_bmi2_adjust_temp_valid
  2583  	MOVQ    $0x00000001, R14
  2584  
        // Unless the selector was 1, slot 1 is moved down into slot 2 first.
  2585  sequenceDecs_decodeSync_bmi2_adjust_temp_valid:
  2586  	CMPQ R13, $0x01
  2587  	JZ   sequenceDecs_decodeSync_bmi2_adjust_skip
  2588  	MOVQ 152(CX), R12
  2589  	MOVQ R12, 160(CX)
  2590  
        // Slot 0 moves to slot 1 and the selected offset becomes the new slot 0.
  2591  sequenceDecs_decodeSync_bmi2_adjust_skip:
  2592  	MOVQ 144(CX), R12
  2593  	MOVQ R12, 152(CX)
  2594  	MOVQ R14, 144(CX)
  2595  	MOVQ R14, R13
  2596  
  2597  sequenceDecs_decodeSync_bmi2_after_adjust:
  2598  	MOVQ R13, 8(SP)
  2599  
  2600  	// Check values
        	// Adds literal length + match length to the counter at 256(s), subtracts
        	// the literal length from the remaining-literals counter at 104(ctx)
        	// (negative result => error), rejects match lengths above 0x20002, and
        	// rejects a zero offset combined with a non-zero match length.
        	// NOTE(review): 256(s) is presumably s.seqSize - confirm against the Go struct.
  2601  	MOVQ  16(SP), CX
  2602  	MOVQ  24(SP), R12
  2603  	LEAQ  (CX)(R12*1), R14
  2604  	MOVQ  s+0(FP), R15
  2605  	ADDQ  R14, 256(R15)
  2606  	MOVQ  ctx+16(FP), R14
  2607  	SUBQ  R12, 104(R14)
  2608  	JS    error_not_enough_literals
  2609  	CMPQ  CX, $0x00020002
  2610  	JA    sequenceDecs_decodeSync_bmi2_error_match_len_too_big
  2611  	TESTQ R13, R13
  2612  	JNZ   sequenceDecs_decodeSync_bmi2_match_len_ofs_ok
  2613  	TESTQ CX, CX
  2614  	JNZ   sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch
  2615  
        // From here on: CX = literal length, R12 = match offset, R13 = match length.
  2616  sequenceDecs_decodeSync_bmi2_match_len_ofs_ok:
  2617  	MOVQ 24(SP), CX
  2618  	MOVQ 8(SP), R12
  2619  	MOVQ 16(SP), R13
  2620  
  2621  	// Check if we have enough space in s.out
  2622  	LEAQ (CX)(R13*1), R14
  2623  	ADDQ R9, R14
  2624  	CMPQ R14, 32(SP)
  2625  	JA   error_not_enough_space
  2626  
  2627  	// Copy literals
        	// The loop moves 16 bytes per iteration until at least CX bytes are
        	// covered, so it can read and write up to 15 bytes past the literal
        	// length; the pointers are then advanced by exactly CX.
  2628  	TESTQ CX, CX
  2629  	JZ    check_offset
  2630  	XORQ  R14, R14
  2631  
  2632  copy_1:
  2633  	MOVUPS (R10)(R14*1), X0
  2634  	MOVUPS X0, (R9)(R14*1)
  2635  	ADDQ   $0x10, R14
  2636  	CMPQ   R14, CX
  2637  	JB     copy_1
  2638  	ADDQ   CX, R10
  2639  	ADDQ   CX, R9
  2640  	ADDQ   CX, R11
  2641  
  2642  	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
  2643  check_offset:
  2644  	MOVQ R11, CX
  2645  	ADDQ 40(SP), CX
  2646  	CMPQ R12, CX
  2647  	JG   error_match_off_too_big
  2648  	CMPQ R12, 56(SP)
  2649  	JG   error_match_off_too_big
  2650  
  2651  	// Copy match from history
        	// CX = match offset - output position: how many bytes the match reaches
        	// back beyond the start of the output. Zero or below (unsigned borrow)
        	// means the whole match is in the current buffer. Otherwise R14 points CX
        	// bytes before the end of history; if the match is longer than CX, only
        	// the history part is copied here (copy_all_from_history) and the rest
        	// continues from the current buffer.
  2652  	MOVQ R12, CX
  2653  	SUBQ R11, CX
  2654  	JLS  copy_match
  2655  	MOVQ 48(SP), R14
  2656  	SUBQ CX, R14
  2657  	CMPQ R13, CX
  2658  	JG   copy_all_from_history
  2659  	MOVQ R13, CX
  2660  	SUBQ $0x10, CX
  2661  	JB   copy_4_small
  2662  
        // Whole match comes from history, length >= 16: copy 16-byte chunks, then
        // finish with one 16-byte move that ends exactly at the last byte.
  2663  copy_4_loop:
  2664  	MOVUPS (R14), X0
  2665  	MOVUPS X0, (R9)
  2666  	ADDQ   $0x10, R14
  2667  	ADDQ   $0x10, R9
  2668  	SUBQ   $0x10, CX
  2669  	JAE    copy_4_loop
  2670  	LEAQ   16(R14)(CX*1), R14
  2671  	LEAQ   16(R9)(CX*1), R9
  2672  	MOVUPS -16(R14), X0
  2673  	MOVUPS X0, -16(R9)
  2674  	JMP    copy_4_end
  2675  
        // Length below 16: dispatch on size. There is no separate case for lengths
        // below 3; such values would take the 4-through-7 path.
        // NOTE(review): presumably match lengths are always >= 3 here - confirm.
  2676  copy_4_small:
  2677  	CMPQ R13, $0x03
  2678  	JE   copy_4_move_3
  2679  	CMPQ R13, $0x08
  2680  	JB   copy_4_move_4through7
  2681  	JMP  copy_4_move_8through16
  2682  
  2683  copy_4_move_3:
  2684  	MOVW (R14), CX
  2685  	MOVB 2(R14), R12
  2686  	MOVW CX, (R9)
  2687  	MOVB R12, 2(R9)
  2688  	ADDQ R13, R14
  2689  	ADDQ R13, R9
  2690  	JMP  copy_4_end
  2691  
        // Two possibly overlapping 4-byte moves cover the first and last 4 bytes.
  2692  copy_4_move_4through7:
  2693  	MOVL (R14), CX
  2694  	MOVL -4(R14)(R13*1), R12
  2695  	MOVL CX, (R9)
  2696  	MOVL R12, -4(R9)(R13*1)
  2697  	ADDQ R13, R14
  2698  	ADDQ R13, R9
  2699  	JMP  copy_4_end
  2700  
        // Two possibly overlapping 8-byte moves cover the first and last 8 bytes.
  2701  copy_4_move_8through16:
  2702  	MOVQ (R14), CX
  2703  	MOVQ -8(R14)(R13*1), R12
  2704  	MOVQ CX, (R9)
  2705  	MOVQ R12, -8(R9)(R13*1)
  2706  	ADDQ R13, R14
  2707  	ADDQ R13, R9
  2708  
  2709  copy_4_end:
  2710  	ADDQ R13, R11
  2711  	JMP  handle_loop
  2712  	JMP loop_finished // never executed: follows an unconditional JMP
  2713  
        // The match starts in history and continues in the current buffer: copy
        // the CX bytes that come from history, using the same chunked / small-size
        // scheme as above. BP is used as a scratch register in the small paths.
  2714  copy_all_from_history:
  2715  	MOVQ CX, R15
  2716  	SUBQ $0x10, R15
  2717  	JB   copy_5_small
  2718  
  2719  copy_5_loop:
  2720  	MOVUPS (R14), X0
  2721  	MOVUPS X0, (R9)
  2722  	ADDQ   $0x10, R14
  2723  	ADDQ   $0x10, R9
  2724  	SUBQ   $0x10, R15
  2725  	JAE    copy_5_loop
  2726  	LEAQ   16(R14)(R15*1), R14
  2727  	LEAQ   16(R9)(R15*1), R9
  2728  	MOVUPS -16(R14), X0
  2729  	MOVUPS X0, -16(R9)
  2730  	JMP    copy_5_end
  2731  
  2732  copy_5_small:
  2733  	CMPQ CX, $0x03
  2734  	JE   copy_5_move_3
  2735  	JB   copy_5_move_1or2
  2736  	CMPQ CX, $0x08
  2737  	JB   copy_5_move_4through7
  2738  	JMP  copy_5_move_8through16
  2739  
        // First and last byte; for a length of 1 both moves hit the same byte.
  2740  copy_5_move_1or2:
  2741  	MOVB (R14), R15
  2742  	MOVB -1(R14)(CX*1), BP
  2743  	MOVB R15, (R9)
  2744  	MOVB BP, -1(R9)(CX*1)
  2745  	ADDQ CX, R14
  2746  	ADDQ CX, R9
  2747  	JMP  copy_5_end
  2748  
  2749  copy_5_move_3:
  2750  	MOVW (R14), R15
  2751  	MOVB 2(R14), BP
  2752  	MOVW R15, (R9)
  2753  	MOVB BP, 2(R9)
  2754  	ADDQ CX, R14
  2755  	ADDQ CX, R9
  2756  	JMP  copy_5_end
  2757  
  2758  copy_5_move_4through7:
  2759  	MOVL (R14), R15
  2760  	MOVL -4(R14)(CX*1), BP
  2761  	MOVL R15, (R9)
  2762  	MOVL BP, -4(R9)(CX*1)
  2763  	ADDQ CX, R14
  2764  	ADDQ CX, R9
  2765  	JMP  copy_5_end
  2766  
  2767  copy_5_move_8through16:
  2768  	MOVQ (R14), R15
  2769  	MOVQ -8(R14)(CX*1), BP
  2770  	MOVQ R15, (R9)
  2771  	MOVQ BP, -8(R9)(CX*1)
  2772  	ADDQ CX, R14
  2773  	ADDQ CX, R9
  2774  
        // Account for the history bytes: advance the output position and reduce
        // the remaining match length, then fall through to copy the rest.
  2775  copy_5_end:
  2776  	ADDQ CX, R11
  2777  	SUBQ CX, R13
  2778  
  2779  	// Copy match from the current buffer
        	// CX = source pointer: the write pointer minus the match offset.
  2780  copy_match:
  2781  	MOVQ R9, CX
  2782  	SUBQ R12, CX
  2783  
  2784  	// ml <= mo
  2785  	CMPQ R13, R12
  2786  	JA   copy_overlapping_match
  2787  
  2788  	// Copy non-overlapping match
        	// R9 is advanced by the exact length up front; R12 becomes the running
        	// destination. The loop moves 16 bytes per iteration, so it can write up
        	// to 15 bytes past the end of the match.
  2789  	ADDQ R13, R11
  2790  	MOVQ R9, R12
  2791  	ADDQ R13, R9
  2792  
  2793  copy_2:
  2794  	MOVUPS (CX), X0
  2795  	MOVUPS X0, (R12)
  2796  	ADDQ   $0x10, CX
  2797  	ADDQ   $0x10, R12
  2798  	SUBQ   $0x10, R13
  2799  	JHI    copy_2
  2800  	JMP    handle_loop
  2801  
  2802  	// Copy overlapping match
        	// Source and destination overlap (length > offset), so the copy goes one
        	// byte at a time and later bytes re-read bytes written by this same loop.
  2803  copy_overlapping_match:
  2804  	ADDQ R13, R11
  2805  
  2806  copy_slow_3:
  2807  	MOVB (CX), R12
  2808  	MOVB R12, (R9)
  2809  	INCQ CX
  2810  	INCQ R9
  2811  	DECQ R13
  2812  	JNZ  copy_slow_3
  2813  
        // Decrement the counter at 96(ctx) and loop while it is still non-negative.
  2814  handle_loop:
  2815  	MOVQ ctx+16(FP), CX
  2816  	DECQ 96(CX)
  2817  	JNS  sequenceDecs_decodeSync_bmi2_main_loop
  2818  
        // Write the bit reader state back: container, consumed-bit count, offset.
  2819  loop_finished:
  2820  	MOVQ br+8(FP), CX
  2821  	MOVQ AX, 32(CX)
  2822  	MOVB DL, 40(CX)
  2823  	MOVQ BX, 24(CX)
  2824  
  2825  	// Update the context
        	// Stores the output position at 136(ctx) and, at 168(ctx), the number of
        	// literal bytes consumed (current literals pointer minus the one at 144(ctx)).
  2826  	MOVQ ctx+16(FP), AX
  2827  	MOVQ R11, 136(AX)
  2828  	MOVQ 144(AX), CX
  2829  	SUBQ CX, R10
  2830  	MOVQ R10, 168(AX)
  2831  
  2832  	// Return success
  2833  	MOVQ $0x00000000, ret+24(FP)
  2834  	RET
  2835  
  2836  	// Return with match length error
        	// Reports the offending match length at 216(ctx).
  2837  sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch:
  2838  	MOVQ 16(SP), AX
  2839  	MOVQ ctx+16(FP), CX
  2840  	MOVQ AX, 216(CX)
  2841  	MOVQ $0x00000001, ret+24(FP)
  2842  	RET
  2843  
  2844  	// Return with match too long error
        	// Reports the offending match length at 216(ctx).
  2845  sequenceDecs_decodeSync_bmi2_error_match_len_too_big:
  2846  	MOVQ ctx+16(FP), AX
  2847  	MOVQ 16(SP), CX
  2848  	MOVQ CX, 216(AX)
  2849  	MOVQ $0x00000002, ret+24(FP)
  2850  	RET
  2851  
  2852  	// Return with match offset too long error
        	// Reports the match offset at 224(ctx) and the output position at 136(ctx).
  2853  error_match_off_too_big:
  2854  	MOVQ ctx+16(FP), AX
  2855  	MOVQ 8(SP), CX
  2856  	MOVQ CX, 224(AX)
  2857  	MOVQ R11, 136(AX)
  2858  	MOVQ $0x00000003, ret+24(FP)
  2859  	RET
  2860  
  2861  	// Return with not enough literals error
        	// Reports the requested literal length at 208(ctx).
  2862  error_not_enough_literals:
  2863  	MOVQ ctx+16(FP), AX
  2864  	MOVQ 24(SP), CX
  2865  	MOVQ CX, 208(AX)
  2866  	MOVQ $0x00000004, ret+24(FP)
  2867  	RET
  2868  
  2869  	// Return with overread error
  2870  error_overread:
  2871  	MOVQ $0x00000006, ret+24(FP)
  2872  	RET
  2873  
  2874  	// Return with not enough output space error
        	// Reports the literal length at 208(ctx), the match length at 216(ctx)
        	// and the output position at 136(ctx).
  2875  error_not_enough_space:
  2876  	MOVQ ctx+16(FP), AX
  2877  	MOVQ 24(SP), CX
  2878  	MOVQ CX, 208(AX)
  2879  	MOVQ 16(SP), CX
  2880  	MOVQ CX, 216(AX)
  2881  	MOVQ R11, 136(AX)
  2882  	MOVQ $0x00000005, ret+24(FP)
  2883  	RET
  2884  
  2885  // func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
  2886  // Requires: CMOV, SSE
  2887  TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32
  2888  	MOVQ    br+8(FP), AX
  2889  	MOVQ    32(AX), DX
  2890  	MOVBQZX 40(AX), BX
  2891  	MOVQ    24(AX), SI
  2892  	MOVQ    (AX), AX
  2893  	ADDQ    SI, AX
  2894  	MOVQ    AX, (SP)
  2895  	MOVQ    ctx+16(FP), AX
  2896  	MOVQ    72(AX), DI
  2897  	MOVQ    80(AX), R8
  2898  	MOVQ    88(AX), R9
  2899  	XORQ    CX, CX
  2900  	MOVQ    CX, 8(SP)
  2901  	MOVQ    CX, 16(SP)
  2902  	MOVQ    CX, 24(SP)
  2903  	MOVQ    112(AX), R10
  2904  	MOVQ    128(AX), CX
  2905  	MOVQ    CX, 32(SP)
  2906  	MOVQ    144(AX), R11
  2907  	MOVQ    136(AX), R12
  2908  	MOVQ    200(AX), CX
  2909  	MOVQ    CX, 56(SP)
  2910  	MOVQ    176(AX), CX
  2911  	MOVQ    CX, 48(SP)
  2912  	MOVQ    184(AX), AX
  2913  	MOVQ    AX, 40(SP)
  2914  	MOVQ    40(SP), AX
  2915  	ADDQ    AX, 48(SP)
  2916  
  2917  	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
  2918  	ADDQ R10, 32(SP)
  2919  
  2920  	// outBase += outPosition
  2921  	ADDQ R12, R10
  2922  
  2923  sequenceDecs_decodeSync_safe_amd64_main_loop:
  2924  	MOVQ (SP), R13
  2925  
  2926  	// Fill bitreader to have enough for the offset and match length.
  2927  	CMPQ SI, $0x08
  2928  	JL   sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
  2929  	MOVQ BX, AX
  2930  	SHRQ $0x03, AX
  2931  	SUBQ AX, R13
  2932  	MOVQ (R13), DX
  2933  	SUBQ AX, SI
  2934  	ANDQ $0x07, BX
  2935  	JMP  sequenceDecs_decodeSync_safe_amd64_fill_end
  2936  
  2937  sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte:
  2938  	CMPQ    SI, $0x00
  2939  	JLE     sequenceDecs_decodeSync_safe_amd64_fill_check_overread
  2940  	CMPQ    BX, $0x07
  2941  	JLE     sequenceDecs_decodeSync_safe_amd64_fill_end
  2942  	SHLQ    $0x08, DX
  2943  	SUBQ    $0x01, R13
  2944  	SUBQ    $0x01, SI
  2945  	SUBQ    $0x08, BX
  2946  	MOVBQZX (R13), AX
  2947  	ORQ     AX, DX
  2948  	JMP     sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
  2949  
  2950  sequenceDecs_decodeSync_safe_amd64_fill_check_overread:
  2951  	CMPQ BX, $0x40
  2952  	JA   error_overread
  2953  
  2954  sequenceDecs_decodeSync_safe_amd64_fill_end:
  2955  	// Update offset
  2956  	MOVQ  R9, AX
  2957  	MOVQ  BX, CX
  2958  	MOVQ  DX, R14
  2959  	SHLQ  CL, R14
  2960  	MOVB  AH, CL
  2961  	SHRQ  $0x20, AX
  2962  	TESTQ CX, CX
  2963  	JZ    sequenceDecs_decodeSync_safe_amd64_of_update_zero
  2964  	ADDQ  CX, BX
  2965  	CMPQ  BX, $0x40
  2966  	JA    sequenceDecs_decodeSync_safe_amd64_of_update_zero
  2967  	CMPQ  CX, $0x40
  2968  	JAE   sequenceDecs_decodeSync_safe_amd64_of_update_zero
  2969  	NEGQ  CX
  2970  	SHRQ  CL, R14
  2971  	ADDQ  R14, AX
  2972  
  2973  sequenceDecs_decodeSync_safe_amd64_of_update_zero:
  2974  	MOVQ AX, 8(SP)
  2975  
  2976  	// Update match length
  2977  	MOVQ  R8, AX
  2978  	MOVQ  BX, CX
  2979  	MOVQ  DX, R14
  2980  	SHLQ  CL, R14
  2981  	MOVB  AH, CL
  2982  	SHRQ  $0x20, AX
  2983  	TESTQ CX, CX
  2984  	JZ    sequenceDecs_decodeSync_safe_amd64_ml_update_zero
  2985  	ADDQ  CX, BX
  2986  	CMPQ  BX, $0x40
  2987  	JA    sequenceDecs_decodeSync_safe_amd64_ml_update_zero
  2988  	CMPQ  CX, $0x40
  2989  	JAE   sequenceDecs_decodeSync_safe_amd64_ml_update_zero
  2990  	NEGQ  CX
  2991  	SHRQ  CL, R14
  2992  	ADDQ  R14, AX
  2993  
  2994  sequenceDecs_decodeSync_safe_amd64_ml_update_zero:
  2995  	MOVQ AX, 16(SP)
  2996  
  2997  	// Fill bitreader to have enough for the remaining
  2998  	CMPQ SI, $0x08
  2999  	JL   sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
  3000  	MOVQ BX, AX
  3001  	SHRQ $0x03, AX
  3002  	SUBQ AX, R13
  3003  	MOVQ (R13), DX
  3004  	SUBQ AX, SI
  3005  	ANDQ $0x07, BX
  3006  	JMP  sequenceDecs_decodeSync_safe_amd64_fill_2_end
  3007  
  3008  sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte:
  3009  	CMPQ    SI, $0x00
  3010  	JLE     sequenceDecs_decodeSync_safe_amd64_fill_2_check_overread
  3011  	CMPQ    BX, $0x07
  3012  	JLE     sequenceDecs_decodeSync_safe_amd64_fill_2_end
  3013  	SHLQ    $0x08, DX
  3014  	SUBQ    $0x01, R13
  3015  	SUBQ    $0x01, SI
  3016  	SUBQ    $0x08, BX
  3017  	MOVBQZX (R13), AX
  3018  	ORQ     AX, DX
  3019  	JMP     sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
  3020  
  3021  sequenceDecs_decodeSync_safe_amd64_fill_2_check_overread:
  3022  	CMPQ BX, $0x40
  3023  	JA   error_overread
  3024  
  3025  sequenceDecs_decodeSync_safe_amd64_fill_2_end:
  3026  	// Update literal length
  3027  	MOVQ  DI, AX
  3028  	MOVQ  BX, CX
  3029  	MOVQ  DX, R14
  3030  	SHLQ  CL, R14
  3031  	MOVB  AH, CL
  3032  	SHRQ  $0x20, AX
  3033  	TESTQ CX, CX
  3034  	JZ    sequenceDecs_decodeSync_safe_amd64_ll_update_zero
  3035  	ADDQ  CX, BX
  3036  	CMPQ  BX, $0x40
  3037  	JA    sequenceDecs_decodeSync_safe_amd64_ll_update_zero
  3038  	CMPQ  CX, $0x40
  3039  	JAE   sequenceDecs_decodeSync_safe_amd64_ll_update_zero
  3040  	NEGQ  CX
  3041  	SHRQ  CL, R14
  3042  	ADDQ  R14, AX
  3043  
  3044  sequenceDecs_decodeSync_safe_amd64_ll_update_zero:
  3045  	MOVQ AX, 24(SP)
  3046  
  3047  	// Fill bitreader for state updates
  3048  	MOVQ    R13, (SP)
  3049  	MOVQ    R9, AX
  3050  	SHRQ    $0x08, AX
  3051  	MOVBQZX AL, AX
  3052  	MOVQ    ctx+16(FP), CX
  3053  	CMPQ    96(CX), $0x00
  3054  	JZ      sequenceDecs_decodeSync_safe_amd64_skip_update
  3055  
  3056  	// Update Literal Length State
  3057  	MOVBQZX DI, R13
  3058  	SHRQ    $0x10, DI
  3059  	MOVWQZX DI, DI
  3060  	LEAQ    (BX)(R13*1), CX
  3061  	MOVQ    DX, R14
  3062  	MOVQ    CX, BX
  3063  	ROLQ    CL, R14
  3064  	MOVL    $0x00000001, R15
  3065  	MOVB    R13, CL
  3066  	SHLL    CL, R15
  3067  	DECL    R15
  3068  	ANDQ    R15, R14
  3069  	ADDQ    R14, DI
  3070  
  3071  	// Load ctx.llTable
  3072  	MOVQ ctx+16(FP), CX
  3073  	MOVQ (CX), CX
  3074  	MOVQ (CX)(DI*8), DI
  3075  
  3076  	// Update Match Length State
  3077  	MOVBQZX R8, R13
  3078  	SHRQ    $0x10, R8
  3079  	MOVWQZX R8, R8
  3080  	LEAQ    (BX)(R13*1), CX
  3081  	MOVQ    DX, R14
  3082  	MOVQ    CX, BX
  3083  	ROLQ    CL, R14
  3084  	MOVL    $0x00000001, R15
  3085  	MOVB    R13, CL
  3086  	SHLL    CL, R15
  3087  	DECL    R15
  3088  	ANDQ    R15, R14
  3089  	ADDQ    R14, R8
  3090  
  3091  	// Load ctx.mlTable
  3092  	MOVQ ctx+16(FP), CX
  3093  	MOVQ 24(CX), CX
  3094  	MOVQ (CX)(R8*8), R8
  3095  
  3096  	// Update Offset State
  3097  	MOVBQZX R9, R13
  3098  	SHRQ    $0x10, R9
  3099  	MOVWQZX R9, R9
  3100  	LEAQ    (BX)(R13*1), CX
  3101  	MOVQ    DX, R14
  3102  	MOVQ    CX, BX
  3103  	ROLQ    CL, R14
  3104  	MOVL    $0x00000001, R15
  3105  	MOVB    R13, CL
  3106  	SHLL    CL, R15
  3107  	DECL    R15
  3108  	ANDQ    R15, R14
  3109  	ADDQ    R14, R9
  3110  
  3111  	// Load ctx.ofTable
  3112  	MOVQ ctx+16(FP), CX
  3113  	MOVQ 48(CX), CX
  3114  	MOVQ (CX)(R9*8), R9
  3115  
  3116  sequenceDecs_decodeSync_safe_amd64_skip_update:
  3117  	// Adjust offset
  3118  	MOVQ   s+0(FP), CX
  3119  	MOVQ   8(SP), R13
  3120  	CMPQ   AX, $0x01
  3121  	JBE    sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0
  3122  	MOVUPS 144(CX), X0
  3123  	MOVQ   R13, 144(CX)
  3124  	MOVUPS X0, 152(CX)
  3125  	JMP    sequenceDecs_decodeSync_safe_amd64_after_adjust
  3126  
  3127  sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0:
  3128  	CMPQ 24(SP), $0x00000000
  3129  	JNE  sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero
  3130  	INCQ R13
  3131  	JMP  sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
  3132  
  3133  sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero:
  3134  	TESTQ R13, R13
  3135  	JNZ   sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
  3136  	MOVQ  144(CX), R13
  3137  	JMP   sequenceDecs_decodeSync_safe_amd64_after_adjust
  3138  
  3139  sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero:
  3140  	MOVQ    R13, AX
  3141  	XORQ    R14, R14
  3142  	MOVQ    $-1, R15
  3143  	CMPQ    R13, $0x03
  3144  	CMOVQEQ R14, AX
  3145  	CMOVQEQ R15, R14
  3146  	ADDQ    144(CX)(AX*8), R14
  3147  	JNZ     sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid
  3148  	MOVQ    $0x00000001, R14
  3149  
  3150  sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid:
  3151  	CMPQ R13, $0x01
  3152  	JZ   sequenceDecs_decodeSync_safe_amd64_adjust_skip
  3153  	MOVQ 152(CX), AX
  3154  	MOVQ AX, 160(CX)
  3155  
  3156  sequenceDecs_decodeSync_safe_amd64_adjust_skip:
  3157  	MOVQ 144(CX), AX
  3158  	MOVQ AX, 152(CX)
  3159  	MOVQ R14, 144(CX)
  3160  	MOVQ R14, R13
  3161  
  3162  sequenceDecs_decodeSync_safe_amd64_after_adjust:
  3163  	MOVQ R13, 8(SP)
  3164  
  3165  	// Check values
  3166  	MOVQ  16(SP), AX
  3167  	MOVQ  24(SP), CX
  3168  	LEAQ  (AX)(CX*1), R14
  3169  	MOVQ  s+0(FP), R15
  3170  	ADDQ  R14, 256(R15)
  3171  	MOVQ  ctx+16(FP), R14
  3172  	SUBQ  CX, 104(R14)
  3173  	JS    error_not_enough_literals
  3174  	CMPQ  AX, $0x00020002
  3175  	JA    sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big
  3176  	TESTQ R13, R13
  3177  	JNZ   sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok
  3178  	TESTQ AX, AX
  3179  	JNZ   sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch
  3180  
  3181  sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok:
  3182  	MOVQ 24(SP), AX
  3183  	MOVQ 8(SP), CX
  3184  	MOVQ 16(SP), R13
  3185  
  3186  	// Check if we have enough space in s.out
  3187  	LEAQ (AX)(R13*1), R14
  3188  	ADDQ R10, R14
  3189  	CMPQ R14, 32(SP)
  3190  	JA   error_not_enough_space
  3191  
  3192  	// Copy literals
  3193  	TESTQ AX, AX
  3194  	JZ    check_offset
  3195  	MOVQ  AX, R14
  3196  	SUBQ  $0x10, R14
  3197  	JB    copy_1_small
  3198  
  3199  copy_1_loop:
  3200  	MOVUPS (R11), X0
  3201  	MOVUPS X0, (R10)
  3202  	ADDQ   $0x10, R11
  3203  	ADDQ   $0x10, R10
  3204  	SUBQ   $0x10, R14
  3205  	JAE    copy_1_loop
  3206  	LEAQ   16(R11)(R14*1), R11
  3207  	LEAQ   16(R10)(R14*1), R10
  3208  	MOVUPS -16(R11), X0
  3209  	MOVUPS X0, -16(R10)
  3210  	JMP    copy_1_end
  3211  
  3212  copy_1_small:
  3213  	CMPQ AX, $0x03
  3214  	JE   copy_1_move_3
  3215  	JB   copy_1_move_1or2
  3216  	CMPQ AX, $0x08
  3217  	JB   copy_1_move_4through7
  3218  	JMP  copy_1_move_8through16
  3219  
  3220  copy_1_move_1or2:
  3221  	MOVB (R11), R14
  3222  	MOVB -1(R11)(AX*1), R15
  3223  	MOVB R14, (R10)
  3224  	MOVB R15, -1(R10)(AX*1)
  3225  	ADDQ AX, R11
  3226  	ADDQ AX, R10
  3227  	JMP  copy_1_end
  3228  
  3229  copy_1_move_3:
  3230  	MOVW (R11), R14
  3231  	MOVB 2(R11), R15
  3232  	MOVW R14, (R10)
  3233  	MOVB R15, 2(R10)
  3234  	ADDQ AX, R11
  3235  	ADDQ AX, R10
  3236  	JMP  copy_1_end
  3237  
  3238  copy_1_move_4through7:
  3239  	MOVL (R11), R14
  3240  	MOVL -4(R11)(AX*1), R15
  3241  	MOVL R14, (R10)
  3242  	MOVL R15, -4(R10)(AX*1)
  3243  	ADDQ AX, R11
  3244  	ADDQ AX, R10
  3245  	JMP  copy_1_end
  3246  
  3247  copy_1_move_8through16:
  3248  	MOVQ (R11), R14
  3249  	MOVQ -8(R11)(AX*1), R15
  3250  	MOVQ R14, (R10)
  3251  	MOVQ R15, -8(R10)(AX*1)
  3252  	ADDQ AX, R11
  3253  	ADDQ AX, R10
  3254  
  3255  copy_1_end:
  3256  	ADDQ AX, R12
  3257  
  3258  	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
  3259  check_offset:
  3260  	MOVQ R12, AX
  3261  	ADDQ 40(SP), AX
  3262  	CMPQ CX, AX
  3263  	JG   error_match_off_too_big
  3264  	CMPQ CX, 56(SP)
  3265  	JG   error_match_off_too_big
  3266  
  3267  	// Copy match from history
  3268  	MOVQ CX, AX
  3269  	SUBQ R12, AX
  3270  	JLS  copy_match
  3271  	MOVQ 48(SP), R14
  3272  	SUBQ AX, R14
  3273  	CMPQ R13, AX
  3274  	JG   copy_all_from_history
  3275  	MOVQ R13, AX
  3276  	SUBQ $0x10, AX
  3277  	JB   copy_4_small
  3278  
  3279  copy_4_loop:
  3280  	MOVUPS (R14), X0
  3281  	MOVUPS X0, (R10)
  3282  	ADDQ   $0x10, R14
  3283  	ADDQ   $0x10, R10
  3284  	SUBQ   $0x10, AX
  3285  	JAE    copy_4_loop
  3286  	LEAQ   16(R14)(AX*1), R14
  3287  	LEAQ   16(R10)(AX*1), R10
  3288  	MOVUPS -16(R14), X0
  3289  	MOVUPS X0, -16(R10)
  3290  	JMP    copy_4_end
  3291  
  3292  copy_4_small:
  3293  	CMPQ R13, $0x03
  3294  	JE   copy_4_move_3
  3295  	CMPQ R13, $0x08
  3296  	JB   copy_4_move_4through7
  3297  	JMP  copy_4_move_8through16
  3298  
  3299  copy_4_move_3:
  3300  	MOVW (R14), AX
  3301  	MOVB 2(R14), CL
  3302  	MOVW AX, (R10)
  3303  	MOVB CL, 2(R10)
  3304  	ADDQ R13, R14
  3305  	ADDQ R13, R10
  3306  	JMP  copy_4_end
  3307  
  3308  copy_4_move_4through7:
  3309  	MOVL (R14), AX
  3310  	MOVL -4(R14)(R13*1), CX
  3311  	MOVL AX, (R10)
  3312  	MOVL CX, -4(R10)(R13*1)
  3313  	ADDQ R13, R14
  3314  	ADDQ R13, R10
  3315  	JMP  copy_4_end
  3316  
  3317  copy_4_move_8through16:
  3318  	MOVQ (R14), AX
  3319  	MOVQ -8(R14)(R13*1), CX
  3320  	MOVQ AX, (R10)
  3321  	MOVQ CX, -8(R10)(R13*1)
  3322  	ADDQ R13, R14
  3323  	ADDQ R13, R10
  3324  
  3325  copy_4_end:
  3326  	ADDQ R13, R12
  3327  	JMP  handle_loop
  3328  	JMP loop_finished
  3329  
  3330  copy_all_from_history:
  3331  	MOVQ AX, R15
  3332  	SUBQ $0x10, R15
  3333  	JB   copy_5_small
  3334  
  3335  copy_5_loop:
  3336  	MOVUPS (R14), X0
  3337  	MOVUPS X0, (R10)
  3338  	ADDQ   $0x10, R14
  3339  	ADDQ   $0x10, R10
  3340  	SUBQ   $0x10, R15
  3341  	JAE    copy_5_loop
  3342  	LEAQ   16(R14)(R15*1), R14
  3343  	LEAQ   16(R10)(R15*1), R10
  3344  	MOVUPS -16(R14), X0
  3345  	MOVUPS X0, -16(R10)
  3346  	JMP    copy_5_end
  3347  
  3348  copy_5_small:
  3349  	CMPQ AX, $0x03
  3350  	JE   copy_5_move_3
  3351  	JB   copy_5_move_1or2
  3352  	CMPQ AX, $0x08
  3353  	JB   copy_5_move_4through7
  3354  	JMP  copy_5_move_8through16
  3355  
  3356  copy_5_move_1or2:
  3357  	MOVB (R14), R15
  3358  	MOVB -1(R14)(AX*1), BP
  3359  	MOVB R15, (R10)
  3360  	MOVB BP, -1(R10)(AX*1)
  3361  	ADDQ AX, R14
  3362  	ADDQ AX, R10
  3363  	JMP  copy_5_end
  3364  
  3365  copy_5_move_3:
  3366  	MOVW (R14), R15
  3367  	MOVB 2(R14), BP
  3368  	MOVW R15, (R10)
  3369  	MOVB BP, 2(R10)
  3370  	ADDQ AX, R14
  3371  	ADDQ AX, R10
  3372  	JMP  copy_5_end
  3373  
  3374  copy_5_move_4through7:
  3375  	MOVL (R14), R15
  3376  	MOVL -4(R14)(AX*1), BP
  3377  	MOVL R15, (R10)
  3378  	MOVL BP, -4(R10)(AX*1)
  3379  	ADDQ AX, R14
  3380  	ADDQ AX, R10
  3381  	JMP  copy_5_end
  3382  
  3383  copy_5_move_8through16:
  3384  	MOVQ (R14), R15
  3385  	MOVQ -8(R14)(AX*1), BP
  3386  	MOVQ R15, (R10)
  3387  	MOVQ BP, -8(R10)(AX*1)
  3388  	ADDQ AX, R14
  3389  	ADDQ AX, R10
  3390  
  3391  copy_5_end:
  3392  	ADDQ AX, R12
  3393  	SUBQ AX, R13
  3394  
  3395  	// Copy match from the current buffer
  3396  copy_match:
  3397  	MOVQ R10, AX
  3398  	SUBQ CX, AX
  3399  
  3400  	// ml <= mo
  3401  	CMPQ R13, CX
  3402  	JA   copy_overlapping_match
  3403  
  3404  	// Copy non-overlapping match
  3405  	ADDQ R13, R12
  3406  	MOVQ R13, CX
  3407  	SUBQ $0x10, CX
  3408  	JB   copy_2_small
  3409  
  3410  copy_2_loop:
  3411  	MOVUPS (AX), X0
  3412  	MOVUPS X0, (R10)
  3413  	ADDQ   $0x10, AX
  3414  	ADDQ   $0x10, R10
  3415  	SUBQ   $0x10, CX
  3416  	JAE    copy_2_loop
  3417  	LEAQ   16(AX)(CX*1), AX
  3418  	LEAQ   16(R10)(CX*1), R10
  3419  	MOVUPS -16(AX), X0
  3420  	MOVUPS X0, -16(R10)
  3421  	JMP    copy_2_end
  3422  
  3423  copy_2_small:
  3424  	CMPQ R13, $0x03
  3425  	JE   copy_2_move_3
  3426  	JB   copy_2_move_1or2
  3427  	CMPQ R13, $0x08
  3428  	JB   copy_2_move_4through7
  3429  	JMP  copy_2_move_8through16
  3430  
  3431  copy_2_move_1or2:
  3432  	MOVB (AX), CL
  3433  	MOVB -1(AX)(R13*1), R14
  3434  	MOVB CL, (R10)
  3435  	MOVB R14, -1(R10)(R13*1)
  3436  	ADDQ R13, AX
  3437  	ADDQ R13, R10
  3438  	JMP  copy_2_end
  3439  
  3440  copy_2_move_3:
  3441  	MOVW (AX), CX
  3442  	MOVB 2(AX), R14
  3443  	MOVW CX, (R10)
  3444  	MOVB R14, 2(R10)
  3445  	ADDQ R13, AX
  3446  	ADDQ R13, R10
  3447  	JMP  copy_2_end
  3448  
  3449  copy_2_move_4through7:
  3450  	MOVL (AX), CX
  3451  	MOVL -4(AX)(R13*1), R14
  3452  	MOVL CX, (R10)
  3453  	MOVL R14, -4(R10)(R13*1)
  3454  	ADDQ R13, AX
  3455  	ADDQ R13, R10
  3456  	JMP  copy_2_end
  3457  
  3458  copy_2_move_8through16:
  3459  	MOVQ (AX), CX
  3460  	MOVQ -8(AX)(R13*1), R14
  3461  	MOVQ CX, (R10)
  3462  	MOVQ R14, -8(R10)(R13*1)
  3463  	ADDQ R13, AX
  3464  	ADDQ R13, R10
  3465  
  3466  copy_2_end:
  3467  	JMP handle_loop
  3468  
  3469  	// Copy overlapping match
  3470  copy_overlapping_match:
  3471  	ADDQ R13, R12
  3472  
  3473  copy_slow_3:
  3474  	MOVB (AX), CL
  3475  	MOVB CL, (R10)
  3476  	INCQ AX
  3477  	INCQ R10
  3478  	DECQ R13
  3479  	JNZ  copy_slow_3
  3480  
  3481  handle_loop:
  3482  	MOVQ ctx+16(FP), AX
  3483  	DECQ 96(AX)
  3484  	JNS  sequenceDecs_decodeSync_safe_amd64_main_loop
  3485  
  3486  loop_finished:
  3487  	MOVQ br+8(FP), AX
  3488  	MOVQ DX, 32(AX)
  3489  	MOVB BL, 40(AX)
  3490  	MOVQ SI, 24(AX)
  3491  
  3492  	// Update the context
  3493  	MOVQ ctx+16(FP), AX
  3494  	MOVQ R12, 136(AX)
  3495  	MOVQ 144(AX), CX
  3496  	SUBQ CX, R11
  3497  	MOVQ R11, 168(AX)
  3498  
  3499  	// Return success
  3500  	MOVQ $0x00000000, ret+24(FP)
  3501  	RET
  3502  
  3503  	// Return with match length error
  3504  sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch:
  3505  	MOVQ 16(SP), AX
  3506  	MOVQ ctx+16(FP), CX
  3507  	MOVQ AX, 216(CX)
  3508  	MOVQ $0x00000001, ret+24(FP)
  3509  	RET
  3510  
  3511  	// Return with match too long error
  3512  sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big:
  3513  	MOVQ ctx+16(FP), AX
  3514  	MOVQ 16(SP), CX
  3515  	MOVQ CX, 216(AX)
  3516  	MOVQ $0x00000002, ret+24(FP)
  3517  	RET
  3518  
  3519  	// Return with match offset too long error
  3520  error_match_off_too_big:
  3521  	MOVQ ctx+16(FP), AX
  3522  	MOVQ 8(SP), CX
  3523  	MOVQ CX, 224(AX)
  3524  	MOVQ R12, 136(AX)
  3525  	MOVQ $0x00000003, ret+24(FP)
  3526  	RET
  3527  
  3528  	// Return with not enough literals error
  3529  error_not_enough_literals:
  3530  	MOVQ ctx+16(FP), AX
  3531  	MOVQ 24(SP), CX
  3532  	MOVQ CX, 208(AX)
  3533  	MOVQ $0x00000004, ret+24(FP)
  3534  	RET
  3535  
  3536  	// Return with overread error
  3537  error_overread:
  3538  	MOVQ $0x00000006, ret+24(FP)
  3539  	RET
  3540  
  3541  	// Return with not enough output space error
  3542  error_not_enough_space:
  3543  	MOVQ ctx+16(FP), AX
  3544  	MOVQ 24(SP), CX
  3545  	MOVQ CX, 208(AX)
  3546  	MOVQ 16(SP), CX
  3547  	MOVQ CX, 216(AX)
  3548  	MOVQ R12, 136(AX)
  3549  	MOVQ $0x00000005, ret+24(FP)
  3550  	RET
  3551  
  3552  // func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
  3553  // Requires: BMI, BMI2, CMOV, SSE
        //
        // Decodes one sequence (offset, match length, literal length) per iteration
        // from the bit stream and executes it immediately: literals are copied to the
        // output, then the match is copied from the history and/or the output itself.
        // The loop runs until the counter at ctx+96 becomes negative.
        //
        // Return value (ret+24(FP)):
        //   0 success                  1 match length / offset mismatch
        //   2 match length too big     3 match offset too big
        //   4 not enough literals      5 not enough output space
        //   6 bit reader overread
        //
        // Register roles inside the main loop:
        //   AX  64-bit bit buffer (br+32)       DX  bits already consumed from AX (br+40)
        //   BX  input bytes not yet loaded (br+24)
        //   SI / DI / R8  literal-length / match-length / offset decoder state
        //   R9  output write pointer            R10 literals read pointer
        //   R11 output position
        //   CX, R12-R15, BP, X0 are scratch and change meaning between sections.
        //
        // Stack slots:
        //    0(SP) input read pointer (moves backwards)
        //    8(SP) match offset      16(SP) match length      24(SP) literal length
        //   32(SP) past-end pointer of the output buffer
        //   40(SP) ctx+184 (NOTE(review): presumably len(hist), per the check_offset comment)
        //   48(SP) ctx+176 + ctx+184, used as the end-of-history pointer
        //   56(SP) ctx+200, compared against the offset as the window size
  3554  TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32
        	// Load the bit reader fields and build the input read pointer
        	// (value at br+0 plus the offset held at br+24).
  3555  	MOVQ    br+8(FP), CX
  3556  	MOVQ    32(CX), AX
  3557  	MOVBQZX 40(CX), DX
  3558  	MOVQ    24(CX), BX
  3559  	MOVQ    (CX), CX
  3560  	ADDQ    BX, CX
  3561  	MOVQ    CX, (SP)
        	// Load the three decoder states and clear the mo/ml/ll scratch slots.
  3562  	MOVQ    ctx+16(FP), CX
  3563  	MOVQ    72(CX), SI
  3564  	MOVQ    80(CX), DI
  3565  	MOVQ    88(CX), R8
  3566  	XORQ    R9, R9
  3567  	MOVQ    R9, 8(SP)
  3568  	MOVQ    R9, 16(SP)
  3569  	MOVQ    R9, 24(SP)
        	// Output base (R9), output capacity (32(SP)), literals pointer (R10),
        	// output position (R11), window size (56(SP)) and history bounds.
  3570  	MOVQ    112(CX), R9
  3571  	MOVQ    128(CX), R10
  3572  	MOVQ    R10, 32(SP)
  3573  	MOVQ    144(CX), R10
  3574  	MOVQ    136(CX), R11
  3575  	MOVQ    200(CX), R12
  3576  	MOVQ    R12, 56(SP)
  3577  	MOVQ    176(CX), R12
  3578  	MOVQ    R12, 48(SP)
  3579  	MOVQ    184(CX), CX
  3580  	MOVQ    CX, 40(SP)
        	// 48(SP) = ctx+176 + ctx+184: pointer just past the end of the history.
  3581  	MOVQ    40(SP), CX
  3582  	ADDQ    CX, 48(SP)
  3583  
  3584  	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
  3585  	ADDQ R9, 32(SP)
  3586  
  3587  	// outBase += outPosition
  3588  	ADDQ R11, R9
  3589  
  3590  sequenceDecs_decodeSync_safe_bmi2_main_loop:
  3591  	MOVQ (SP), R12
  3592  
  3593  	// Fill bitreader to have enough for the offset and match length.
        	// Fast path (at least 8 input bytes left): step the read pointer back by
        	// DX/8 bytes, reload a full 64-bit word and keep only DX%8 consumed bits.
  3594  	CMPQ BX, $0x08
  3595  	JL   sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
  3596  	MOVQ DX, CX
  3597  	SHRQ $0x03, CX
  3598  	SUBQ CX, R12
  3599  	MOVQ (R12), AX
  3600  	SUBQ CX, BX
  3601  	ANDQ $0x07, DX
  3602  	JMP  sequenceDecs_decodeSync_safe_bmi2_fill_end
  3603  
        	// Slow path: shift in one byte at a time while input remains and more
        	// than 7 bits have been consumed.
  3604  sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte:
  3605  	CMPQ    BX, $0x00
  3606  	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_check_overread
  3607  	CMPQ    DX, $0x07
  3608  	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_end
  3609  	SHLQ    $0x08, AX
  3610  	SUBQ    $0x01, R12
  3611  	SUBQ    $0x01, BX
  3612  	SUBQ    $0x08, DX
  3613  	MOVBQZX (R12), CX
  3614  	ORQ     CX, AX
  3615  	JMP     sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
  3616  
        	// Input exhausted: consuming more than 64 bits of the buffer is an overread.
  3617  sequenceDecs_decodeSync_safe_bmi2_fill_check_overread:
  3618  	CMPQ DX, $0x40
  3619  	JA   error_overread
  3620  
  3621  sequenceDecs_decodeSync_safe_bmi2_fill_end:
  3622  	// Update offset
        	// 0x0808 is a BEXTR control word (start bit 8, length 8): R13 = bits 8..15
        	// of the state = number of extra bits to read. The bits are taken by
        	// rotating AX left by DX+R13 and masking to the low R13 bits (BZHI).
        	// The base value is the upper 32 bits of the state.
  3623  	MOVQ   $0x00000808, CX
  3624  	BEXTRQ CX, R8, R13
  3625  	MOVQ   AX, R14
  3626  	LEAQ   (DX)(R13*1), CX
  3627  	ROLQ   CL, R14
  3628  	BZHIQ  R13, R14, R14
  3629  	MOVQ   CX, DX
  3630  	MOVQ   R8, CX
  3631  	SHRQ   $0x20, CX
  3632  	ADDQ   R14, CX
  3633  	MOVQ   CX, 8(SP)
  3634  
  3635  	// Update match length
        	// Same extraction as above, using the match-length state in DI.
  3636  	MOVQ   $0x00000808, CX
  3637  	BEXTRQ CX, DI, R13
  3638  	MOVQ   AX, R14
  3639  	LEAQ   (DX)(R13*1), CX
  3640  	ROLQ   CL, R14
  3641  	BZHIQ  R13, R14, R14
  3642  	MOVQ   CX, DX
  3643  	MOVQ   DI, CX
  3644  	SHRQ   $0x20, CX
  3645  	ADDQ   R14, CX
  3646  	MOVQ   CX, 16(SP)
  3647  
  3648  	// Fill bitreader to have enough for the remaining
        	// Identical refill logic to the first fill above.
  3649  	CMPQ BX, $0x08
  3650  	JL   sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
  3651  	MOVQ DX, CX
  3652  	SHRQ $0x03, CX
  3653  	SUBQ CX, R12
  3654  	MOVQ (R12), AX
  3655  	SUBQ CX, BX
  3656  	ANDQ $0x07, DX
  3657  	JMP  sequenceDecs_decodeSync_safe_bmi2_fill_2_end
  3658  
  3659  sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte:
  3660  	CMPQ    BX, $0x00
  3661  	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_2_check_overread
  3662  	CMPQ    DX, $0x07
  3663  	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_2_end
  3664  	SHLQ    $0x08, AX
  3665  	SUBQ    $0x01, R12
  3666  	SUBQ    $0x01, BX
  3667  	SUBQ    $0x08, DX
  3668  	MOVBQZX (R12), CX
  3669  	ORQ     CX, AX
  3670  	JMP     sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
  3671  
  3672  sequenceDecs_decodeSync_safe_bmi2_fill_2_check_overread:
  3673  	CMPQ DX, $0x40
  3674  	JA   error_overread
  3675  
  3676  sequenceDecs_decodeSync_safe_bmi2_fill_2_end:
  3677  	// Update literal length
        	// Same extraction as for the offset, using the literal-length state in SI.
  3678  	MOVQ   $0x00000808, CX
  3679  	BEXTRQ CX, SI, R13
  3680  	MOVQ   AX, R14
  3681  	LEAQ   (DX)(R13*1), CX
  3682  	ROLQ   CL, R14
  3683  	BZHIQ  R13, R14, R14
  3684  	MOVQ   CX, DX
  3685  	MOVQ   SI, CX
  3686  	SHRQ   $0x20, CX
  3687  	ADDQ   R14, CX
  3688  	MOVQ   CX, 24(SP)
  3689  
  3690  	// Fill bitreader for state updates
        	// The input pointer is saved back to (SP), freeing R12. R12 then holds
        	// bits 8..15 of the offset state (its extra-bit count), captured before
        	// the state is replaced; it drives the "Adjust offset" decision below.
        	// The state update is skipped when the counter at ctx+96 is zero, i.e.
        	// on the final iteration (see handle_loop).
  3691  	MOVQ    R12, (SP)
  3692  	MOVQ    $0x00000808, CX
  3693  	BEXTRQ  CX, R8, R12
  3694  	MOVQ    ctx+16(FP), CX
  3695  	CMPQ    96(CX), $0x00
  3696  	JZ      sequenceDecs_decodeSync_safe_bmi2_skip_update
        	// R13 = low byte of SI+DI+R8: total bit count needed by the three state
        	// updates. Those bits are extracted into R14 in one go.
  3697  	LEAQ    (SI)(DI*1), R13
  3698  	ADDQ    R8, R13
  3699  	MOVBQZX R13, R13
  3700  	LEAQ    (DX)(R13*1), CX
  3701  	MOVQ    AX, R14
  3702  	MOVQ    CX, DX
  3703  	ROLQ    CL, R14
  3704  	BZHIQ   R13, R14, R14
  3705  
  3706  	// Update Offset State
        	// BZHI/SHRX use the low byte of the state as the bit count: CX gets the
        	// low bits of R14, which are then shifted out. 0x1010 is a BEXTR control
        	// word (start bit 16, length 16) selecting bits 16..31 of the state,
        	// to which the freshly read bits are added to form the table index.
  3707  	BZHIQ  R8, R14, CX
  3708  	SHRXQ  R8, R14, R14
  3709  	MOVQ   $0x00001010, R13
  3710  	BEXTRQ R13, R8, R8
  3711  	ADDQ   CX, R8
  3712  
  3713  	// Load ctx.ofTable
  3714  	MOVQ ctx+16(FP), CX
  3715  	MOVQ 48(CX), CX
  3716  	MOVQ (CX)(R8*8), R8
  3717  
  3718  	// Update Match Length State
  3719  	BZHIQ  DI, R14, CX
  3720  	SHRXQ  DI, R14, R14
  3721  	MOVQ   $0x00001010, R13
  3722  	BEXTRQ R13, DI, DI
  3723  	ADDQ   CX, DI
  3724  
  3725  	// Load ctx.mlTable
  3726  	MOVQ ctx+16(FP), CX
  3727  	MOVQ 24(CX), CX
  3728  	MOVQ (CX)(DI*8), DI
  3729  
  3730  	// Update Literal Length State
        	// Last consumer of R14, so no SHRX is needed here.
  3731  	BZHIQ  SI, R14, CX
  3732  	MOVQ   $0x00001010, R13
  3733  	BEXTRQ R13, SI, SI
  3734  	ADDQ   CX, SI
  3735  
  3736  	// Load ctx.llTable
  3737  	MOVQ ctx+16(FP), CX
  3738  	MOVQ (CX), CX
  3739  	MOVQ (CX)(SI*8), SI
  3740  
  3741  sequenceDecs_decodeSync_safe_bmi2_skip_update:
  3742  	// Adjust offset
        	// R13 = decoded offset value, R12 = offset extra-bit count.
        	// More than one extra bit: the value is used as-is. The 16-byte load and
        	// store shift the two values at s+144/s+152 to s+152/s+160 and the new
        	// offset is written to s+144.
        	// NOTE(review): s+144..160 look like the three repeat offsets; confirm
        	// against the sequenceDecs struct layout.
  3743  	MOVQ   s+0(FP), CX
  3744  	MOVQ   8(SP), R13
  3745  	CMPQ   R12, $0x01
  3746  	JBE    sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0
  3747  	MOVUPS 144(CX), X0
  3748  	MOVQ   R13, 144(CX)
  3749  	MOVUPS X0, 152(CX)
  3750  	JMP    sequenceDecs_decodeSync_safe_bmi2_after_adjust
  3751  
        	// At most one extra bit: the value selects one of the stored offsets.
        	// A zero literal length bumps the selector by one.
  3752  sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0:
  3753  	CMPQ 24(SP), $0x00000000
  3754  	JNE  sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero
  3755  	INCQ R13
  3756  	JMP  sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
  3757  
        	// Selector 0 with a non-zero literal length: reuse s+144 unchanged.
  3758  sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero:
  3759  	TESTQ R13, R13
  3760  	JNZ   sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
  3761  	MOVQ  144(CX), R13
  3762  	JMP   sequenceDecs_decodeSync_safe_bmi2_after_adjust
  3763  
        	// Selector 3 means "value at s+144 minus one" (index 0, addend -1);
        	// otherwise the value at s+144+8*selector with addend 0. A zero result
        	// is replaced by 1.
  3764  sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero:
  3765  	MOVQ    R13, R12
  3766  	XORQ    R14, R14
  3767  	MOVQ    $-1, R15
  3768  	CMPQ    R13, $0x03
  3769  	CMOVQEQ R14, R12
  3770  	CMOVQEQ R15, R14
  3771  	ADDQ    144(CX)(R12*8), R14
  3772  	JNZ     sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid
  3773  	MOVQ    $0x00000001, R14
  3774  
        	// Unless the selector is 1, s+152 is first moved to s+160.
  3775  sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid:
  3776  	CMPQ R13, $0x01
  3777  	JZ   sequenceDecs_decodeSync_safe_bmi2_adjust_skip
  3778  	MOVQ 152(CX), R12
  3779  	MOVQ R12, 160(CX)
  3780  
        	// s+144 moves to s+152 and the selected offset becomes the new s+144.
  3781  sequenceDecs_decodeSync_safe_bmi2_adjust_skip:
  3782  	MOVQ 144(CX), R12
  3783  	MOVQ R12, 152(CX)
  3784  	MOVQ R14, 144(CX)
  3785  	MOVQ R14, R13
  3786  
  3787  sequenceDecs_decodeSync_safe_bmi2_after_adjust:
  3788  	MOVQ R13, 8(SP)
  3789  
  3790  	// Check values
        	// CX = match length, R12 = literal length, R13 = final offset.
        	// ll+ml is added to the field at s+256 (NOTE(review): presumably the
        	// running decoded size; confirm). ll is subtracted from the counter at
        	// ctx+104; a negative result means the sequence asks for more literals
        	// than remain. ml above 0x20002 is rejected, as is a zero offset paired
        	// with a non-zero match length.
  3791  	MOVQ  16(SP), CX
  3792  	MOVQ  24(SP), R12
  3793  	LEAQ  (CX)(R12*1), R14
  3794  	MOVQ  s+0(FP), R15
  3795  	ADDQ  R14, 256(R15)
  3796  	MOVQ  ctx+16(FP), R14
  3797  	SUBQ  R12, 104(R14)
  3798  	JS    error_not_enough_literals
  3799  	CMPQ  CX, $0x00020002
  3800  	JA    sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big
  3801  	TESTQ R13, R13
  3802  	JNZ   sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok
  3803  	TESTQ CX, CX
  3804  	JNZ   sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch
  3805  
        	// From here on: CX = literal length, R12 = match offset, R13 = match length.
  3806  sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok:
  3807  	MOVQ 24(SP), CX
  3808  	MOVQ 8(SP), R12
  3809  	MOVQ 16(SP), R13
  3810  
  3811  	// Check if we have enough space in s.out
  3812  	LEAQ (CX)(R13*1), R14
  3813  	ADDQ R9, R14
  3814  	CMPQ R14, 32(SP)
  3815  	JA   error_not_enough_space
  3816  
  3817  	// Copy literals
        	// Lengths of 16 or more are copied in 16-byte chunks; the tail is covered
        	// by one final 16-byte copy that ends exactly on the last byte and may
        	// overlap bytes already copied. Shorter lengths use two (possibly
        	// overlapping) loads/stores covering the first and last bytes.
  3818  	TESTQ CX, CX
  3819  	JZ    check_offset
  3820  	MOVQ  CX, R14
  3821  	SUBQ  $0x10, R14
  3822  	JB    copy_1_small
  3823  
  3824  copy_1_loop:
  3825  	MOVUPS (R10), X0
  3826  	MOVUPS X0, (R9)
  3827  	ADDQ   $0x10, R10
  3828  	ADDQ   $0x10, R9
  3829  	SUBQ   $0x10, R14
  3830  	JAE    copy_1_loop
  3831  	LEAQ   16(R10)(R14*1), R10
  3832  	LEAQ   16(R9)(R14*1), R9
  3833  	MOVUPS -16(R10), X0
  3834  	MOVUPS X0, -16(R9)
  3835  	JMP    copy_1_end
  3836  
  3837  copy_1_small:
  3838  	CMPQ CX, $0x03
  3839  	JE   copy_1_move_3
  3840  	JB   copy_1_move_1or2
  3841  	CMPQ CX, $0x08
  3842  	JB   copy_1_move_4through7
  3843  	JMP  copy_1_move_8through16
  3844  
  3845  copy_1_move_1or2:
  3846  	MOVB (R10), R14
  3847  	MOVB -1(R10)(CX*1), R15
  3848  	MOVB R14, (R9)
  3849  	MOVB R15, -1(R9)(CX*1)
  3850  	ADDQ CX, R10
  3851  	ADDQ CX, R9
  3852  	JMP  copy_1_end
  3853  
  3854  copy_1_move_3:
  3855  	MOVW (R10), R14
  3856  	MOVB 2(R10), R15
  3857  	MOVW R14, (R9)
  3858  	MOVB R15, 2(R9)
  3859  	ADDQ CX, R10
  3860  	ADDQ CX, R9
  3861  	JMP  copy_1_end
  3862  
  3863  copy_1_move_4through7:
  3864  	MOVL (R10), R14
  3865  	MOVL -4(R10)(CX*1), R15
  3866  	MOVL R14, (R9)
  3867  	MOVL R15, -4(R9)(CX*1)
  3868  	ADDQ CX, R10
  3869  	ADDQ CX, R9
  3870  	JMP  copy_1_end
  3871  
  3872  copy_1_move_8through16:
  3873  	MOVQ (R10), R14
  3874  	MOVQ -8(R10)(CX*1), R15
  3875  	MOVQ R14, (R9)
  3876  	MOVQ R15, -8(R9)(CX*1)
  3877  	ADDQ CX, R10
  3878  	ADDQ CX, R9
  3879  
        	// Account for the copied literals in the output position.
  3880  copy_1_end:
  3881  	ADDQ CX, R11
  3882  
  3883  	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
  3884  check_offset:
  3885  	MOVQ R11, CX
  3886  	ADDQ 40(SP), CX
  3887  	CMPQ R12, CX
  3888  	JG   error_match_off_too_big
  3889  	CMPQ R12, 56(SP)
  3890  	JG   error_match_off_too_big
  3891  
  3892  	// Copy match from history
        	// CX = offset - output position. If that is zero or borrows (JLS) the
        	// match starts inside the current output and is handled by copy_match.
        	// Otherwise CX bytes of the match lie in the history: R14 = history end
        	// minus CX is the source. If the match is longer than CX, the history
        	// part is copied first (copy_all_from_history) and the remainder comes
        	// from the output buffer.
  3893  	MOVQ R12, CX
  3894  	SUBQ R11, CX
  3895  	JLS  copy_match
  3896  	MOVQ 48(SP), R14
  3897  	SUBQ CX, R14
  3898  	CMPQ R13, CX
  3899  	JG   copy_all_from_history
  3900  	MOVQ R13, CX
  3901  	SUBQ $0x10, CX
  3902  	JB   copy_4_small
  3903  
  3904  copy_4_loop:
  3905  	MOVUPS (R14), X0
  3906  	MOVUPS X0, (R9)
  3907  	ADDQ   $0x10, R14
  3908  	ADDQ   $0x10, R9
  3909  	SUBQ   $0x10, CX
  3910  	JAE    copy_4_loop
  3911  	LEAQ   16(R14)(CX*1), R14
  3912  	LEAQ   16(R9)(CX*1), R9
  3913  	MOVUPS -16(R14), X0
  3914  	MOVUPS X0, -16(R9)
  3915  	JMP    copy_4_end
  3916  
        	// NOTE(review): unlike the other small-copy dispatchers there is no
        	// 1-or-2 byte case here; a length below 3 would fall into the 4through7
        	// path. Presumably the match length is always at least 3 on this path;
        	// confirm against the decoder tables.
  3917  copy_4_small:
  3918  	CMPQ R13, $0x03
  3919  	JE   copy_4_move_3
  3920  	CMPQ R13, $0x08
  3921  	JB   copy_4_move_4through7
  3922  	JMP  copy_4_move_8through16
  3923  
  3924  copy_4_move_3:
  3925  	MOVW (R14), CX
  3926  	MOVB 2(R14), R12
  3927  	MOVW CX, (R9)
  3928  	MOVB R12, 2(R9)
  3929  	ADDQ R13, R14
  3930  	ADDQ R13, R9
  3931  	JMP  copy_4_end
  3932  
  3933  copy_4_move_4through7:
  3934  	MOVL (R14), CX
  3935  	MOVL -4(R14)(R13*1), R12
  3936  	MOVL CX, (R9)
  3937  	MOVL R12, -4(R9)(R13*1)
  3938  	ADDQ R13, R14
  3939  	ADDQ R13, R9
  3940  	JMP  copy_4_end
  3941  
  3942  copy_4_move_8through16:
  3943  	MOVQ (R14), CX
  3944  	MOVQ -8(R14)(R13*1), R12
  3945  	MOVQ CX, (R9)
  3946  	MOVQ R12, -8(R9)(R13*1)
  3947  	ADDQ R13, R14
  3948  	ADDQ R13, R9
  3949  
        	// Whole match came from the history: advance the output position and go
        	// to the next sequence. The second JMP below is unreachable because it
        	// directly follows an unconditional jump.
  3950  copy_4_end:
  3951  	ADDQ R13, R11
  3952  	JMP  handle_loop
  3953  	JMP loop_finished
  3954  
        	// Copy the CX bytes that lie in the history; the rest of the match is
        	// then taken from the output buffer by copy_match.
  3955  copy_all_from_history:
  3956  	MOVQ CX, R15
  3957  	SUBQ $0x10, R15
  3958  	JB   copy_5_small
  3959  
  3960  copy_5_loop:
  3961  	MOVUPS (R14), X0
  3962  	MOVUPS X0, (R9)
  3963  	ADDQ   $0x10, R14
  3964  	ADDQ   $0x10, R9
  3965  	SUBQ   $0x10, R15
  3966  	JAE    copy_5_loop
  3967  	LEAQ   16(R14)(R15*1), R14
  3968  	LEAQ   16(R9)(R15*1), R9
  3969  	MOVUPS -16(R14), X0
  3970  	MOVUPS X0, -16(R9)
  3971  	JMP    copy_5_end
  3972  
  3973  copy_5_small:
  3974  	CMPQ CX, $0x03
  3975  	JE   copy_5_move_3
  3976  	JB   copy_5_move_1or2
  3977  	CMPQ CX, $0x08
  3978  	JB   copy_5_move_4through7
  3979  	JMP  copy_5_move_8through16
  3980  
  3981  copy_5_move_1or2:
  3982  	MOVB (R14), R15
  3983  	MOVB -1(R14)(CX*1), BP
  3984  	MOVB R15, (R9)
  3985  	MOVB BP, -1(R9)(CX*1)
  3986  	ADDQ CX, R14
  3987  	ADDQ CX, R9
  3988  	JMP  copy_5_end
  3989  
  3990  copy_5_move_3:
  3991  	MOVW (R14), R15
  3992  	MOVB 2(R14), BP
  3993  	MOVW R15, (R9)
  3994  	MOVB BP, 2(R9)
  3995  	ADDQ CX, R14
  3996  	ADDQ CX, R9
  3997  	JMP  copy_5_end
  3998  
  3999  copy_5_move_4through7:
  4000  	MOVL (R14), R15
  4001  	MOVL -4(R14)(CX*1), BP
  4002  	MOVL R15, (R9)
  4003  	MOVL BP, -4(R9)(CX*1)
  4004  	ADDQ CX, R14
  4005  	ADDQ CX, R9
  4006  	JMP  copy_5_end
  4007  
  4008  copy_5_move_8through16:
  4009  	MOVQ (R14), R15
  4010  	MOVQ -8(R14)(CX*1), BP
  4011  	MOVQ R15, (R9)
  4012  	MOVQ BP, -8(R9)(CX*1)
  4013  	ADDQ CX, R14
  4014  	ADDQ CX, R9
  4015  
        	// Advance the output position by the history part and reduce the match
        	// length (R13) to what is still to be copied.
  4016  copy_5_end:
  4017  	ADDQ CX, R11
  4018  	SUBQ CX, R13
  4019  
  4020  	// Copy match from the current buffer
        	// CX = output write pointer minus offset: the match source in the output.
  4021  copy_match:
  4022  	MOVQ R9, CX
  4023  	SUBQ R12, CX
  4024  
  4025  	// ml <= mo
  4026  	CMPQ R13, R12
  4027  	JA   copy_overlapping_match
  4028  
  4029  	// Copy non-overlapping match
  4030  	ADDQ R13, R11
  4031  	MOVQ R13, R12
  4032  	SUBQ $0x10, R12
  4033  	JB   copy_2_small
  4034  
  4035  copy_2_loop:
  4036  	MOVUPS (CX), X0
  4037  	MOVUPS X0, (R9)
  4038  	ADDQ   $0x10, CX
  4039  	ADDQ   $0x10, R9
  4040  	SUBQ   $0x10, R12
  4041  	JAE    copy_2_loop
  4042  	LEAQ   16(CX)(R12*1), CX
  4043  	LEAQ   16(R9)(R12*1), R9
  4044  	MOVUPS -16(CX), X0
  4045  	MOVUPS X0, -16(R9)
  4046  	JMP    copy_2_end
  4047  
  4048  copy_2_small:
  4049  	CMPQ R13, $0x03
  4050  	JE   copy_2_move_3
  4051  	JB   copy_2_move_1or2
  4052  	CMPQ R13, $0x08
  4053  	JB   copy_2_move_4through7
  4054  	JMP  copy_2_move_8through16
  4055  
  4056  copy_2_move_1or2:
  4057  	MOVB (CX), R12
  4058  	MOVB -1(CX)(R13*1), R14
  4059  	MOVB R12, (R9)
  4060  	MOVB R14, -1(R9)(R13*1)
  4061  	ADDQ R13, CX
  4062  	ADDQ R13, R9
  4063  	JMP  copy_2_end
  4064  
  4065  copy_2_move_3:
  4066  	MOVW (CX), R12
  4067  	MOVB 2(CX), R14
  4068  	MOVW R12, (R9)
  4069  	MOVB R14, 2(R9)
  4070  	ADDQ R13, CX
  4071  	ADDQ R13, R9
  4072  	JMP  copy_2_end
  4073  
  4074  copy_2_move_4through7:
  4075  	MOVL (CX), R12
  4076  	MOVL -4(CX)(R13*1), R14
  4077  	MOVL R12, (R9)
  4078  	MOVL R14, -4(R9)(R13*1)
  4079  	ADDQ R13, CX
  4080  	ADDQ R13, R9
  4081  	JMP  copy_2_end
  4082  
  4083  copy_2_move_8through16:
  4084  	MOVQ (CX), R12
  4085  	MOVQ -8(CX)(R13*1), R14
  4086  	MOVQ R12, (R9)
  4087  	MOVQ R14, -8(R9)(R13*1)
  4088  	ADDQ R13, CX
  4089  	ADDQ R13, R9
  4090  
  4091  copy_2_end:
  4092  	JMP handle_loop
  4093  
  4094  	// Copy overlapping match
        	// Source and destination overlap (ml > mo), so the copy goes byte by
        	// byte and re-reads bytes written earlier in this same copy.
  4095  copy_overlapping_match:
  4096  	ADDQ R13, R11
  4097  
  4098  copy_slow_3:
  4099  	MOVB (CX), R12
  4100  	MOVB R12, (R9)
  4101  	INCQ CX
  4102  	INCQ R9
  4103  	DECQ R13
  4104  	JNZ  copy_slow_3
  4105  
        	// Decrement the counter at ctx+96 and continue while it stays non-negative.
  4106  handle_loop:
  4107  	MOVQ ctx+16(FP), CX
  4108  	DECQ 96(CX)
  4109  	JNS  sequenceDecs_decodeSync_safe_bmi2_main_loop
  4110  
        	// Write the bit reader state (bit buffer, consumed bits, remaining
        	// offset) back to br.
  4111  loop_finished:
  4112  	MOVQ br+8(FP), CX
  4113  	MOVQ AX, 32(CX)
  4114  	MOVB DL, 40(CX)
  4115  	MOVQ BX, 24(CX)
  4116  
  4117  	// Update the context
        	// ctx+136 = output position; ctx+168 = literals pointer minus its start
        	// value at ctx+144, i.e. the number of literal bytes consumed.
  4118  	MOVQ ctx+16(FP), AX
  4119  	MOVQ R11, 136(AX)
  4120  	MOVQ 144(AX), CX
  4121  	SUBQ CX, R10
  4122  	MOVQ R10, 168(AX)
  4123  
  4124  	// Return success
  4125  	MOVQ $0x00000000, ret+24(FP)
  4126  	RET
  4127  
  4128  	// Return with match length error
        	// The offending match length is reported at ctx+216.
  4129  sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch:
  4130  	MOVQ 16(SP), AX
  4131  	MOVQ ctx+16(FP), CX
  4132  	MOVQ AX, 216(CX)
  4133  	MOVQ $0x00000001, ret+24(FP)
  4134  	RET
  4135  
  4136  	// Return with match too long error
  4137  sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big:
  4138  	MOVQ ctx+16(FP), AX
  4139  	MOVQ 16(SP), CX
  4140  	MOVQ CX, 216(AX)
  4141  	MOVQ $0x00000002, ret+24(FP)
  4142  	RET
  4143  
  4144  	// Return with match offset too long error
        	// The offset is reported at ctx+224 and the output position at ctx+136.
  4145  error_match_off_too_big:
  4146  	MOVQ ctx+16(FP), AX
  4147  	MOVQ 8(SP), CX
  4148  	MOVQ CX, 224(AX)
  4149  	MOVQ R11, 136(AX)
  4150  	MOVQ $0x00000003, ret+24(FP)
  4151  	RET
  4152  
  4153  	// Return with not enough literals error
        	// The requested literal length is reported at ctx+208.
  4154  error_not_enough_literals:
  4155  	MOVQ ctx+16(FP), AX
  4156  	MOVQ 24(SP), CX
  4157  	MOVQ CX, 208(AX)
  4158  	MOVQ $0x00000004, ret+24(FP)
  4159  	RET
  4160  
  4161  	// Return with overread error
  4162  error_overread:
  4163  	MOVQ $0x00000006, ret+24(FP)
  4164  	RET
  4165  
  4166  	// Return with not enough output space error
        	// Literal length (ctx+208), match length (ctx+216) and output position
        	// (ctx+136) are reported to the caller.
  4167  error_not_enough_space:
  4168  	MOVQ ctx+16(FP), AX
  4169  	MOVQ 24(SP), CX
  4170  	MOVQ CX, 208(AX)
  4171  	MOVQ 16(SP), CX
  4172  	MOVQ CX, 216(AX)
  4173  	MOVQ R11, 136(AX)
  4174  	MOVQ $0x00000005, ret+24(FP)
  4175  	RET