github.com/bir3/gocompiler@v0.9.2202/extra/compress/huff0/decompress_amd64.s (about)

     1  // Code generated by command: go run gen.go -out ../decompress_amd64.s -pkg=huff0. DO NOT EDIT.
     2  
     3  //go:build amd64 && !appengine && !noasm && gc
     4  
     5  // func decompress4x_main_loop_amd64(ctx *decompress4xContext)
     6  TEXT ·decompress4x_main_loop_amd64(SB), $0-8
     7  	// Preload values from ctx; the loop below decodes two bytes from each of the four bit readers per iteration
     8  	MOVQ    ctx+0(FP), AX // AX = ctx
     9  	MOVBQZX 8(AX), DI // DI = shift count used by peekTopBits (peekBits)
    10  	MOVQ    16(AX), BX // BX = output pointer for stream 0
    11  	MOVQ    48(AX), SI // SI = output limit, compared against BX at the top of each iteration
    12  	MOVQ    24(AX), R8 // R8 = dstEvery, the byte distance between the outputs of consecutive streams
    13  	MOVQ    32(AX), R9 // R9 = decoding table base (2-byte entries)
    14  	MOVQ    (AX), R10 // R10 = base of the four bit reader records, laid out 48 bytes apart
    15  
    16  	// Main loop: DL collects the exit conditions (output limit reached, or a reader whose off is < 4 after a refill)
    17  main_loop:
    18  	XORL  DX, DX
    19  	CMPQ  BX, SI
    20  	SETGE DL // DL = 1 once BX >= SI (signed compare)
    21  
    22  	// br0.fillFast32() (br1..br3 below repeat this sequence with record offsets shifted by 48, 96 and 144)
    23  	MOVQ    32(R10), R11 // R11 = br0.value
    24  	MOVBQZX 40(R10), R12 // R12 = br0.bitsRead
    25  	CMPQ    R12, $0x20
    26  	JBE     skip_fill0 // no refill while bitsRead <= 32
    27  	MOVQ    24(R10), AX // AX = br0.off
    28  	SUBQ    $0x20, R12
    29  	SUBQ    $0x04, AX
    30  	MOVQ    (R10), R13 // R13 = input base pointer of br0
    31  
    32  	// b.value |= uint64(low) << (b.bitsRead & 63)
    33  	MOVL (AX)(R13*1), R13 // low = 4 input bytes at base + off, zero-extended
    34  	MOVQ R12, CX
    35  	SHLQ CL, R13
    36  	MOVQ AX, 24(R10) // store the decremented br0.off
    37  	ORQ  R13, R11
    38  
    39  	// exhausted += (br0.off < 4)
    40  	CMPQ AX, $0x04
    41  	ADCB $+0, DL // the CMPQ above sets carry when off < 4 (unsigned)
    42  
    43  skip_fill0:
    44  	// val0 := br0.peekTopBits(peekBits)
    45  	MOVQ R11, R13
    46  	MOVQ DI, CX
    47  	SHRQ CL, R13
    48  
    49  	// v0 := table[val0&mask]
    50  	MOVW (R9)(R13*2), CX // CL = uint8(entry), the number of bits to advance; CH = entry >> 8, the output byte
    51  
    52  	// br0.advance(uint8(v0.entry))
    53  	MOVB CH, AL // AL = first output byte
    54  	SHLQ CL, R11
    55  	ADDB CL, R12
    56  
    57  	// val1 := br0.peekTopBits(peekBits)
    58  	MOVQ DI, CX
    59  	MOVQ R11, R13
    60  	SHRQ CL, R13
    61  
    62  	// v1 := table[val1&mask]
    63  	MOVW (R9)(R13*2), CX
    64  
    65  	// br0.advance(uint8(v1.entry))
    66  	MOVB CH, AH // AH = second output byte
    67  	SHLQ CL, R11
    68  	ADDB CL, R12
    69  
    70  	// these two writes get coalesced
    71  	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
    72  	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
    73  	MOVW AX, (BX)
    74  
    75  	// update the bitreader structure
    76  	MOVQ R11, 32(R10)
    77  	MOVB R12, 40(R10)
    78  
    79  	// br1.fillFast32()
    80  	MOVQ    80(R10), R11
    81  	MOVBQZX 88(R10), R12
    82  	CMPQ    R12, $0x20
    83  	JBE     skip_fill1
    84  	MOVQ    72(R10), AX
    85  	SUBQ    $0x20, R12
    86  	SUBQ    $0x04, AX
    87  	MOVQ    48(R10), R13
    88  
    89  	// b.value |= uint64(low) << (b.bitsRead & 63)
    90  	MOVL (AX)(R13*1), R13
    91  	MOVQ R12, CX
    92  	SHLQ CL, R13
    93  	MOVQ AX, 72(R10)
    94  	ORQ  R13, R11
    95  
    96  	// exhausted += (br1.off < 4)
    97  	CMPQ AX, $0x04
    98  	ADCB $+0, DL
    99  
   100  skip_fill1:
   101  	// val0 := br1.peekTopBits(peekBits)
   102  	MOVQ R11, R13
   103  	MOVQ DI, CX
   104  	SHRQ CL, R13
   105  
   106  	// v0 := table[val0&mask]
   107  	MOVW (R9)(R13*2), CX
   108  
   109  	// br1.advance(uint8(v0.entry))
   110  	MOVB CH, AL
   111  	SHLQ CL, R11
   112  	ADDB CL, R12
   113  
   114  	// val1 := br1.peekTopBits(peekBits)
   115  	MOVQ DI, CX
   116  	MOVQ R11, R13
   117  	SHRQ CL, R13
   118  
   119  	// v1 := table[val1&mask]
   120  	MOVW (R9)(R13*2), CX
   121  
   122  	// br1.advance(uint8(v1.entry))
   123  	MOVB CH, AH
   124  	SHLQ CL, R11
   125  	ADDB CL, R12
   126  
   127  	// these two writes get coalesced
   128  	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
   129  	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
   130  	MOVW AX, (BX)(R8*1)
   131  
   132  	// update the bitreader structure
   133  	MOVQ R11, 80(R10)
   134  	MOVB R12, 88(R10)
   135  
   136  	// br2.fillFast32()
   137  	MOVQ    128(R10), R11
   138  	MOVBQZX 136(R10), R12
   139  	CMPQ    R12, $0x20
   140  	JBE     skip_fill2
   141  	MOVQ    120(R10), AX
   142  	SUBQ    $0x20, R12
   143  	SUBQ    $0x04, AX
   144  	MOVQ    96(R10), R13
   145  
   146  	// b.value |= uint64(low) << (b.bitsRead & 63)
   147  	MOVL (AX)(R13*1), R13
   148  	MOVQ R12, CX
   149  	SHLQ CL, R13
   150  	MOVQ AX, 120(R10)
   151  	ORQ  R13, R11
   152  
   153  	// exhausted += (br2.off < 4)
   154  	CMPQ AX, $0x04
   155  	ADCB $+0, DL
   156  
   157  skip_fill2:
   158  	// val0 := br2.peekTopBits(peekBits)
   159  	MOVQ R11, R13
   160  	MOVQ DI, CX
   161  	SHRQ CL, R13
   162  
   163  	// v0 := table[val0&mask]
   164  	MOVW (R9)(R13*2), CX
   165  
   166  	// br2.advance(uint8(v0.entry))
   167  	MOVB CH, AL
   168  	SHLQ CL, R11
   169  	ADDB CL, R12
   170  
   171  	// val1 := br2.peekTopBits(peekBits)
   172  	MOVQ DI, CX
   173  	MOVQ R11, R13
   174  	SHRQ CL, R13
   175  
   176  	// v1 := table[val1&mask]
   177  	MOVW (R9)(R13*2), CX
   178  
   179  	// br2.advance(uint8(v1.entry))
   180  	MOVB CH, AH
   181  	SHLQ CL, R11
   182  	ADDB CL, R12
   183  
   184  	// these two writes get coalesced
   185  	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
   186  	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
   187  	MOVW AX, (BX)(R8*2)
   188  
   189  	// update the bitreader structure
   190  	MOVQ R11, 128(R10)
   191  	MOVB R12, 136(R10)
   192  
   193  	// br3.fillFast32()
   194  	MOVQ    176(R10), R11
   195  	MOVBQZX 184(R10), R12
   196  	CMPQ    R12, $0x20
   197  	JBE     skip_fill3
   198  	MOVQ    168(R10), AX
   199  	SUBQ    $0x20, R12
   200  	SUBQ    $0x04, AX
   201  	MOVQ    144(R10), R13
   202  
   203  	// b.value |= uint64(low) << (b.bitsRead & 63)
   204  	MOVL (AX)(R13*1), R13
   205  	MOVQ R12, CX
   206  	SHLQ CL, R13
   207  	MOVQ AX, 168(R10)
   208  	ORQ  R13, R11
   209  
   210  	// exhausted += (br3.off < 4)
   211  	CMPQ AX, $0x04
   212  	ADCB $+0, DL
   213  
   214  skip_fill3:
   215  	// val0 := br3.peekTopBits(peekBits)
   216  	MOVQ R11, R13
   217  	MOVQ DI, CX
   218  	SHRQ CL, R13
   219  
   220  	// v0 := table[val0&mask]
   221  	MOVW (R9)(R13*2), CX
   222  
   223  	// br3.advance(uint8(v0.entry))
   224  	MOVB CH, AL
   225  	SHLQ CL, R11
   226  	ADDB CL, R12
   227  
   228  	// val1 := br3.peekTopBits(peekBits)
   229  	MOVQ DI, CX
   230  	MOVQ R11, R13
   231  	SHRQ CL, R13
   232  
   233  	// v1 := table[val1&mask]
   234  	MOVW (R9)(R13*2), CX
   235  
   236  	// br3.advance(uint8(v1.entry))
   237  	MOVB CH, AH
   238  	SHLQ CL, R11
   239  	ADDB CL, R12
   240  
   241  	// these two writes get coalesced
   242  	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
   243  	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
   244  	LEAQ (R8)(R8*2), CX // CX = 3 * dstEvery
   245  	MOVW AX, (BX)(CX*1)
   246  
   247  	// update the bitreader structure, then advance the output by the 2 bytes written per stream and loop while DL == 0
   248  	MOVQ  R11, 176(R10)
   249  	MOVB  R12, 184(R10)
   250  	ADDQ  $0x02, BX
   251  	TESTB DL, DL
   252  	JZ    main_loop
   253  	MOVQ  ctx+0(FP), AX // AX was used as scratch above, so reload ctx
   254  	SUBQ  16(AX), BX // BX = bytes written to stream 0 by this call
   255  	SHLQ  $0x02, BX // multiplied by 4, one share per stream
   256  	MOVQ  BX, 40(AX) // result is stored at ctx+40
   257  	RET
   258  
   259  // func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
   260  TEXT ·decompress4x_8b_main_loop_amd64(SB), $0-8
   261  	// Preload values from ctx; the loop below decodes four bytes from each of the four bit readers per iteration
   262  	MOVQ    ctx+0(FP), CX // CX = ctx (CX is reused as scratch once the loop starts)
   263  	MOVBQZX 8(CX), DI // DI = shift count used by peekTopBits (peekBits)
   264  	MOVQ    16(CX), BX // BX = output pointer for stream 0
   265  	MOVQ    48(CX), SI // SI = output limit, compared against BX at the top of each iteration
   266  	MOVQ    24(CX), R8 // R8 = dstEvery, the byte distance between the outputs of consecutive streams
   267  	MOVQ    32(CX), R9 // R9 = decoding table base (2-byte entries)
   268  	MOVQ    (CX), R10 // R10 = base of the four bit reader records, laid out 48 bytes apart
   269  
   270  	// Main loop: DL collects the exit conditions (output limit reached, or a reader whose off is < 4 after a refill)
   271  main_loop:
   272  	XORL  DX, DX
   273  	CMPQ  BX, SI
   274  	SETGE DL // DL = 1 once BX >= SI (signed compare)
   275  
   276  	// br0.fillFast32() (br1..br3 below repeat this sequence with record offsets shifted by 48, 96 and 144)
   277  	MOVQ    32(R10), R11 // R11 = br0.value
   278  	MOVBQZX 40(R10), R12 // R12 = br0.bitsRead
   279  	CMPQ    R12, $0x20
   280  	JBE     skip_fill0 // no refill while bitsRead <= 32
   281  	MOVQ    24(R10), R13 // R13 = br0.off
   282  	SUBQ    $0x20, R12
   283  	SUBQ    $0x04, R13
   284  	MOVQ    (R10), R14 // R14 = input base pointer of br0
   285  
   286  	// b.value |= uint64(low) << (b.bitsRead & 63)
   287  	MOVL (R13)(R14*1), R14 // low = 4 input bytes at base + off, zero-extended
   288  	MOVQ R12, CX
   289  	SHLQ CL, R14
   290  	MOVQ R13, 24(R10) // store the decremented br0.off
   291  	ORQ  R14, R11
   292  
   293  	// exhausted += (br0.off < 4)
   294  	CMPQ R13, $0x04
   295  	ADCB $+0, DL // the CMPQ above sets carry when off < 4 (unsigned)
   296  
   297  skip_fill0:
   298  	// val0 := br0.peekTopBits(peekBits)
   299  	MOVQ R11, R13
   300  	MOVQ DI, CX
   301  	SHRQ CL, R13
   302  
   303  	// v0 := table[val0&mask]
   304  	MOVW (R9)(R13*2), CX // CL = uint8(entry), the number of bits to advance; CH = entry >> 8, the output byte
   305  
   306  	// br0.advance(uint8(v0.entry))
   307  	MOVB CH, AL
   308  	SHLQ CL, R11
   309  	ADDB CL, R12
   310  
   311  	// val1 := br0.peekTopBits(peekBits)
   312  	MOVQ R11, R13
   313  	MOVQ DI, CX
   314  	SHRQ CL, R13
   315  
   316  	// v1 := table[val1&mask]
   317  	MOVW (R9)(R13*2), CX
   318  
   319  	// br0.advance(uint8(v1.entry))
   320  	MOVB   CH, AH
   321  	SHLQ   CL, R11
   322  	ADDB   CL, R12
   323  	BSWAPL AX // v0 and v1 move to the top two bytes of AX, freeing AL/AH for v3/v2
   324  
   325  	// val2 := br0.peekTopBits(peekBits)
   326  	MOVQ R11, R13
   327  	MOVQ DI, CX
   328  	SHRQ CL, R13
   329  
   330  	// v2 := table[val2&mask]
   331  	MOVW (R9)(R13*2), CX
   332  
   333  	// br0.advance(uint8(v2.entry))
   334  	MOVB CH, AH
   335  	SHLQ CL, R11
   336  	ADDB CL, R12
   337  
   338  	// val3 := br0.peekTopBits(peekBits)
   339  	MOVQ R11, R13
   340  	MOVQ DI, CX
   341  	SHRQ CL, R13
   342  
   343  	// v3 := table[val3&mask]
   344  	MOVW (R9)(R13*2), CX
   345  
   346  	// br0.advance(uint8(v3.entry))
   347  	MOVB   CH, AL
   348  	SHLQ   CL, R11
   349  	ADDB   CL, R12
   350  	BSWAPL AX // second swap leaves the bytes of AX as v0, v1, v2, v3 from lowest to highest
   351  
   352  	// these four writes get coalesced
   353  	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
   354  	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
   355  	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
   356  	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
   357  	MOVL AX, (BX)
   358  
   359  	// update the bitreader structure
   360  	MOVQ R11, 32(R10)
   361  	MOVB R12, 40(R10)
   362  
   363  	// br1.fillFast32()
   364  	MOVQ    80(R10), R11
   365  	MOVBQZX 88(R10), R12
   366  	CMPQ    R12, $0x20
   367  	JBE     skip_fill1
   368  	MOVQ    72(R10), R13
   369  	SUBQ    $0x20, R12
   370  	SUBQ    $0x04, R13
   371  	MOVQ    48(R10), R14
   372  
   373  	// b.value |= uint64(low) << (b.bitsRead & 63)
   374  	MOVL (R13)(R14*1), R14
   375  	MOVQ R12, CX
   376  	SHLQ CL, R14
   377  	MOVQ R13, 72(R10)
   378  	ORQ  R14, R11
   379  
   380  	// exhausted += (br1.off < 4)
   381  	CMPQ R13, $0x04
   382  	ADCB $+0, DL
   383  
   384  skip_fill1:
   385  	// val0 := br1.peekTopBits(peekBits)
   386  	MOVQ R11, R13
   387  	MOVQ DI, CX
   388  	SHRQ CL, R13
   389  
   390  	// v0 := table[val0&mask]
   391  	MOVW (R9)(R13*2), CX
   392  
   393  	// br1.advance(uint8(v0.entry))
   394  	MOVB CH, AL
   395  	SHLQ CL, R11
   396  	ADDB CL, R12
   397  
   398  	// val1 := br1.peekTopBits(peekBits)
   399  	MOVQ R11, R13
   400  	MOVQ DI, CX
   401  	SHRQ CL, R13
   402  
   403  	// v1 := table[val1&mask]
   404  	MOVW (R9)(R13*2), CX
   405  
   406  	// br1.advance(uint8(v1.entry))
   407  	MOVB   CH, AH
   408  	SHLQ   CL, R11
   409  	ADDB   CL, R12
   410  	BSWAPL AX
   411  
   412  	// val2 := br1.peekTopBits(peekBits)
   413  	MOVQ R11, R13
   414  	MOVQ DI, CX
   415  	SHRQ CL, R13
   416  
   417  	// v2 := table[val2&mask]
   418  	MOVW (R9)(R13*2), CX
   419  
   420  	// br1.advance(uint8(v2.entry))
   421  	MOVB CH, AH
   422  	SHLQ CL, R11
   423  	ADDB CL, R12
   424  
   425  	// val3 := br1.peekTopBits(peekBits)
   426  	MOVQ R11, R13
   427  	MOVQ DI, CX
   428  	SHRQ CL, R13
   429  
   430  	// v3 := table[val3&mask]
   431  	MOVW (R9)(R13*2), CX
   432  
   433  	// br1.advance(uint8(v3.entry))
   434  	MOVB   CH, AL
   435  	SHLQ   CL, R11
   436  	ADDB   CL, R12
   437  	BSWAPL AX
   438  
   439  	// these four writes get coalesced
   440  	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
   441  	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
   442  	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
   443  	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
   444  	MOVL AX, (BX)(R8*1)
   445  
   446  	// update the bitreader structure
   447  	MOVQ R11, 80(R10)
   448  	MOVB R12, 88(R10)
   449  
   450  	// br2.fillFast32()
   451  	MOVQ    128(R10), R11
   452  	MOVBQZX 136(R10), R12
   453  	CMPQ    R12, $0x20
   454  	JBE     skip_fill2
   455  	MOVQ    120(R10), R13
   456  	SUBQ    $0x20, R12
   457  	SUBQ    $0x04, R13
   458  	MOVQ    96(R10), R14
   459  
   460  	// b.value |= uint64(low) << (b.bitsRead & 63)
   461  	MOVL (R13)(R14*1), R14
   462  	MOVQ R12, CX
   463  	SHLQ CL, R14
   464  	MOVQ R13, 120(R10)
   465  	ORQ  R14, R11
   466  
   467  	// exhausted += (br2.off < 4)
   468  	CMPQ R13, $0x04
   469  	ADCB $+0, DL
   470  
   471  skip_fill2:
   472  	// val0 := br2.peekTopBits(peekBits)
   473  	MOVQ R11, R13
   474  	MOVQ DI, CX
   475  	SHRQ CL, R13
   476  
   477  	// v0 := table[val0&mask]
   478  	MOVW (R9)(R13*2), CX
   479  
   480  	// br2.advance(uint8(v0.entry))
   481  	MOVB CH, AL
   482  	SHLQ CL, R11
   483  	ADDB CL, R12
   484  
   485  	// val1 := br2.peekTopBits(peekBits)
   486  	MOVQ R11, R13
   487  	MOVQ DI, CX
   488  	SHRQ CL, R13
   489  
   490  	// v1 := table[val1&mask]
   491  	MOVW (R9)(R13*2), CX
   492  
   493  	// br2.advance(uint8(v1.entry))
   494  	MOVB   CH, AH
   495  	SHLQ   CL, R11
   496  	ADDB   CL, R12
   497  	BSWAPL AX
   498  
   499  	// val2 := br2.peekTopBits(peekBits)
   500  	MOVQ R11, R13
   501  	MOVQ DI, CX
   502  	SHRQ CL, R13
   503  
   504  	// v2 := table[val2&mask]
   505  	MOVW (R9)(R13*2), CX
   506  
   507  	// br2.advance(uint8(v2.entry))
   508  	MOVB CH, AH
   509  	SHLQ CL, R11
   510  	ADDB CL, R12
   511  
   512  	// val3 := br2.peekTopBits(peekBits)
   513  	MOVQ R11, R13
   514  	MOVQ DI, CX
   515  	SHRQ CL, R13
   516  
   517  	// v3 := table[val3&mask]
   518  	MOVW (R9)(R13*2), CX
   519  
   520  	// br2.advance(uint8(v3.entry))
   521  	MOVB   CH, AL
   522  	SHLQ   CL, R11
   523  	ADDB   CL, R12
   524  	BSWAPL AX
   525  
   526  	// these four writes get coalesced
   527  	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
   528  	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
   529  	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
   530  	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
   531  	MOVL AX, (BX)(R8*2)
   532  
   533  	// update the bitreader structure
   534  	MOVQ R11, 128(R10)
   535  	MOVB R12, 136(R10)
   536  
   537  	// br3.fillFast32()
   538  	MOVQ    176(R10), R11
   539  	MOVBQZX 184(R10), R12
   540  	CMPQ    R12, $0x20
   541  	JBE     skip_fill3
   542  	MOVQ    168(R10), R13
   543  	SUBQ    $0x20, R12
   544  	SUBQ    $0x04, R13
   545  	MOVQ    144(R10), R14
   546  
   547  	// b.value |= uint64(low) << (b.bitsRead & 63)
   548  	MOVL (R13)(R14*1), R14
   549  	MOVQ R12, CX
   550  	SHLQ CL, R14
   551  	MOVQ R13, 168(R10)
   552  	ORQ  R14, R11
   553  
   554  	// exhausted += (br3.off < 4)
   555  	CMPQ R13, $0x04
   556  	ADCB $+0, DL
   557  
   558  skip_fill3:
   559  	// val0 := br3.peekTopBits(peekBits)
   560  	MOVQ R11, R13
   561  	MOVQ DI, CX
   562  	SHRQ CL, R13
   563  
   564  	// v0 := table[val0&mask]
   565  	MOVW (R9)(R13*2), CX
   566  
   567  	// br3.advance(uint8(v0.entry))
   568  	MOVB CH, AL
   569  	SHLQ CL, R11
   570  	ADDB CL, R12
   571  
   572  	// val1 := br3.peekTopBits(peekBits)
   573  	MOVQ R11, R13
   574  	MOVQ DI, CX
   575  	SHRQ CL, R13
   576  
   577  	// v1 := table[val1&mask]
   578  	MOVW (R9)(R13*2), CX
   579  
   580  	// br3.advance(uint8(v1.entry))
   581  	MOVB   CH, AH
   582  	SHLQ   CL, R11
   583  	ADDB   CL, R12
   584  	BSWAPL AX
   585  
   586  	// val2 := br3.peekTopBits(peekBits)
   587  	MOVQ R11, R13
   588  	MOVQ DI, CX
   589  	SHRQ CL, R13
   590  
   591  	// v2 := table[val2&mask]
   592  	MOVW (R9)(R13*2), CX
   593  
   594  	// br3.advance(uint8(v2.entry))
   595  	MOVB CH, AH
   596  	SHLQ CL, R11
   597  	ADDB CL, R12
   598  
   599  	// val3 := br3.peekTopBits(peekBits)
   600  	MOVQ R11, R13
   601  	MOVQ DI, CX
   602  	SHRQ CL, R13
   603  
   604  	// v3 := table[val3&mask]
   605  	MOVW (R9)(R13*2), CX
   606  
   607  	// br3.advance(uint8(v3.entry))
   608  	MOVB   CH, AL
   609  	SHLQ   CL, R11
   610  	ADDB   CL, R12
   611  	BSWAPL AX
   612  
   613  	// these four writes get coalesced
   614  	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
   615  	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
   616  	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
   617  	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
   618  	LEAQ (R8)(R8*2), CX // CX = 3 * dstEvery
   619  	MOVL AX, (BX)(CX*1)
   620  
   621  	// update the bitreader structure, then advance the output by the 4 bytes written per stream and loop while DL == 0
   622  	MOVQ  R11, 176(R10)
   623  	MOVB  R12, 184(R10)
   624  	ADDQ  $0x04, BX
   625  	TESTB DL, DL
   626  	JZ    main_loop
   627  	MOVQ  ctx+0(FP), AX // reload ctx; CX was overwritten inside the loop
   628  	SUBQ  16(AX), BX // BX = bytes written to stream 0 by this call
   629  	SHLQ  $0x02, BX // multiplied by 4, one share per stream
   630  	MOVQ  BX, 40(AX) // result is stored at ctx+40
   631  	RET
   632  
   633  // func decompress1x_main_loop_amd64(ctx *decompress1xContext)
   634  TEXT ·decompress1x_main_loop_amd64(SB), $0-8
   635  	MOVQ    ctx+0(FP), CX // CX = ctx (CX becomes scratch once the loop starts)
   636  	MOVQ    16(CX), DX // DX = current output pointer
   637  	MOVQ    24(CX), BX // BX = value at ctx+24, used below as the output size in bytes
   638  	CMPQ    BX, $0x04
   639  	JB      error_max_decoded_size_exceeded // fewer than 4 bytes of output space (unsigned compare)
   640  	LEAQ    (DX)(BX*1), BX // BX = end of the output buffer
   641  	MOVQ    (CX), SI // SI = pointer to the bit reader record
   642  	MOVQ    (SI), R8 // R8 = input base pointer
   643  	MOVQ    24(SI), R9 // R9 = read offset into the input
   644  	MOVQ    32(SI), R10 // R10 = bit buffer value
   645  	MOVBQZX 40(SI), R11 // R11 = count of bits already consumed from the bit buffer
   646  	MOVQ    32(CX), SI // SI = decoding table base (2-byte entries); the bit reader pointer is reloaded at exit
   647  	MOVBQZX 8(CX), DI // DI = shift count used to peek the top bits of the bit buffer
   648  	JMP     loop_condition
   649  
   650  main_loop:
   651  	// Check if we have room for 4 bytes in the output buffer
   652  	LEAQ 4(DX), CX
   653  	CMPQ CX, BX
   654  	JGE  error_max_decoded_size_exceeded // taken when DX+4 >= BX (signed compare)
   655  
   656  	// Decode 4 values; the bit buffer is refilled first when at least 32 bits have been consumed
   657  	CMPQ R11, $0x20
   658  	JL   bitReader_fillFast_1_end
   659  	SUBQ $0x20, R11
   660  	SUBQ $0x04, R9
   661  	MOVL (R8)(R9*1), R12 // load 4 input bytes at base + offset, zero-extended
   662  	MOVQ R11, CX
   663  	SHLQ CL, R12
   664  	ORQ  R12, R10
   665  
   666  bitReader_fillFast_1_end:
   667  	MOVQ    DI, CX
   668  	MOVQ    R10, R12
   669  	SHRQ    CL, R12 // R12 = table index taken from the top bits of the bit buffer
   670  	MOVW    (SI)(R12*2), CX // CL = number of bits to consume, CH = decoded byte
   671  	MOVB    CH, AL // first decoded byte
   672  	MOVBQZX CL, CX
   673  	ADDQ    CX, R11
   674  	SHLQ    CL, R10
   675  	MOVQ    DI, CX
   676  	MOVQ    R10, R12
   677  	SHRQ    CL, R12
   678  	MOVW    (SI)(R12*2), CX
   679  	MOVB    CH, AH // second decoded byte
   680  	MOVBQZX CL, CX
   681  	ADDQ    CX, R11
   682  	SHLQ    CL, R10
   683  	BSWAPL  AX // move the first two bytes to the top of AX, freeing AL/AH for the next two
   684  	CMPQ    R11, $0x20
   685  	JL      bitReader_fillFast_2_end
   686  	SUBQ    $0x20, R11
   687  	SUBQ    $0x04, R9
   688  	MOVL    (R8)(R9*1), R12
   689  	MOVQ    R11, CX
   690  	SHLQ    CL, R12
   691  	ORQ     R12, R10
   692  
   693  bitReader_fillFast_2_end:
   694  	MOVQ    DI, CX
   695  	MOVQ    R10, R12
   696  	SHRQ    CL, R12
   697  	MOVW    (SI)(R12*2), CX
   698  	MOVB    CH, AH // third decoded byte
   699  	MOVBQZX CL, CX
   700  	ADDQ    CX, R11
   701  	SHLQ    CL, R10
   702  	MOVQ    DI, CX
   703  	MOVQ    R10, R12
   704  	SHRQ    CL, R12
   705  	MOVW    (SI)(R12*2), CX
   706  	MOVB    CH, AL // fourth decoded byte
   707  	MOVBQZX CL, CX
   708  	ADDQ    CX, R11
   709  	SHLQ    CL, R10
   710  	BSWAPL  AX // second swap puts the four bytes in decode order, lowest byte first
   711  
   712  	// Store the decoded values
   713  	MOVL AX, (DX)
   714  	ADDQ $0x04, DX
   715  
   716  loop_condition:
   717  	CMPQ R9, $0x08
   718  	JGE  main_loop // keep decoding while the read offset is >= 8 (signed compare)
   719  
   720  	// Update ctx structure: ctx+40 receives the number of bytes written, then the bit reader state is written back
   721  	MOVQ ctx+0(FP), AX
   722  	SUBQ 16(AX), DX
   723  	MOVQ DX, 40(AX)
   724  	MOVQ (AX), AX // AX = pointer to the bit reader record
   725  	MOVQ R9, 24(AX)
   726  	MOVQ R10, 32(AX)
   727  	MOVB R11, 40(AX)
   728  	RET
   729  
   730  	// Report error: ctx+40 is set to -1; the bit reader state is not written back on this path
   731  error_max_decoded_size_exceeded:
   732  	MOVQ ctx+0(FP), AX
   733  	MOVQ $-1, CX
   734  	MOVQ CX, 40(AX)
   735  	RET
   736  
   737  // func decompress1x_main_loop_bmi2(ctx *decompress1xContext)
   738  // Requires: BMI2
   739  TEXT ·decompress1x_main_loop_bmi2(SB), $0-8
   740  	MOVQ    ctx+0(FP), CX // CX = ctx (CX becomes scratch once the loop starts)
   741  	MOVQ    16(CX), DX // DX = current output pointer
   742  	MOVQ    24(CX), BX // BX = value at ctx+24, used below as the output size in bytes
   743  	CMPQ    BX, $0x04
   744  	JB      error_max_decoded_size_exceeded // fewer than 4 bytes of output space (unsigned compare)
   745  	LEAQ    (DX)(BX*1), BX // BX = end of the output buffer
   746  	MOVQ    (CX), SI // SI = pointer to the bit reader record
   747  	MOVQ    (SI), R8 // R8 = input base pointer
   748  	MOVQ    24(SI), R9 // R9 = read offset into the input
   749  	MOVQ    32(SI), R10 // R10 = bit buffer value
   750  	MOVBQZX 40(SI), R11 // R11 = count of bits already consumed from the bit buffer
   751  	MOVQ    32(CX), SI // SI = decoding table base (2-byte entries); the bit reader pointer is reloaded at exit
   752  	MOVBQZX 8(CX), DI // DI = shift count used to peek the top bits of the bit buffer
   753  	JMP     loop_condition
   754  
   755  main_loop:
   756  	// Check if we have room for 4 bytes in the output buffer
   757  	LEAQ 4(DX), CX
   758  	CMPQ CX, BX
   759  	JGE  error_max_decoded_size_exceeded // taken when DX+4 >= BX (signed compare)
   760  
   761  	// Decode 4 values; the bit buffer is refilled first when at least 32 bits have been consumed
   762  	CMPQ  R11, $0x20
   763  	JL    bitReader_fillFast_1_end
   764  	SUBQ  $0x20, R11
   765  	SUBQ  $0x04, R9
   766  	MOVL  (R8)(R9*1), CX // load 4 input bytes at base + offset, zero-extended
   767  	SHLXQ R11, CX, CX // CX <<= R11; the BMI2 shift takes its count from any register instead of CL
   768  	ORQ   CX, R10
   769  
   770  bitReader_fillFast_1_end:
   771  	SHRXQ   DI, R10, CX // CX = table index taken from the top bits of the bit buffer
   772  	MOVW    (SI)(CX*2), CX // CL = number of bits to consume, CH = decoded byte
   773  	MOVB    CH, AL // first decoded byte
   774  	MOVBQZX CL, CX
   775  	ADDQ    CX, R11
   776  	SHLXQ   CX, R10, R10
   777  	SHRXQ   DI, R10, CX
   778  	MOVW    (SI)(CX*2), CX
   779  	MOVB    CH, AH // second decoded byte
   780  	MOVBQZX CL, CX
   781  	ADDQ    CX, R11
   782  	SHLXQ   CX, R10, R10
   783  	BSWAPL  AX // move the first two bytes to the top of AX, freeing AL/AH for the next two
   784  	CMPQ    R11, $0x20
   785  	JL      bitReader_fillFast_2_end
   786  	SUBQ    $0x20, R11
   787  	SUBQ    $0x04, R9
   788  	MOVL    (R8)(R9*1), CX
   789  	SHLXQ   R11, CX, CX
   790  	ORQ     CX, R10
   791  
   792  bitReader_fillFast_2_end:
   793  	SHRXQ   DI, R10, CX
   794  	MOVW    (SI)(CX*2), CX
   795  	MOVB    CH, AH // third decoded byte
   796  	MOVBQZX CL, CX
   797  	ADDQ    CX, R11
   798  	SHLXQ   CX, R10, R10
   799  	SHRXQ   DI, R10, CX
   800  	MOVW    (SI)(CX*2), CX
   801  	MOVB    CH, AL // fourth decoded byte
   802  	MOVBQZX CL, CX
   803  	ADDQ    CX, R11
   804  	SHLXQ   CX, R10, R10
   805  	BSWAPL  AX // second swap puts the four bytes in decode order, lowest byte first
   806  
   807  	// Store the decoded values
   808  	MOVL AX, (DX)
   809  	ADDQ $0x04, DX
   810  
   811  loop_condition:
   812  	CMPQ R9, $0x08
   813  	JGE  main_loop // keep decoding while the read offset is >= 8 (signed compare)
   814  
   815  	// Update ctx structure: ctx+40 receives the number of bytes written, then the bit reader state is written back
   816  	MOVQ ctx+0(FP), AX
   817  	SUBQ 16(AX), DX
   818  	MOVQ DX, 40(AX)
   819  	MOVQ (AX), AX // AX = pointer to the bit reader record
   820  	MOVQ R9, 24(AX)
   821  	MOVQ R10, 32(AX)
   822  	MOVB R11, 40(AX)
   823  	RET
   824  
   825  	// Report error: ctx+40 is set to -1; the bit reader state is not written back on this path
   826  error_max_decoded_size_exceeded:
   827  	MOVQ ctx+0(FP), AX
   828  	MOVQ $-1, CX
   829  	MOVQ CX, 40(AX)
   830  	RET