github.com/piotrnar/gocoin@v0.0.0-20240512203912-faa0448c5e96/lib/others/snappy/encode_amd64.s (about)

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build !appengine
     6  // +build gc
     7  // +build !noasm
     8  
     9  #include "textflag.h"
    10  
    11  // The XXX lines assemble on Go 1.4, 1.5 and 1.7, but not 1.6, due to a
    12  // Go toolchain regression. See https://github.com/golang/go/issues/15426 and
    13  // https://github.com/golang/snappy/issues/29
    14  //
    15  // As a workaround, the package was built with a known good assembler, and
    16  // those instructions were disassembled by "objdump -d" to yield the
    17  //	4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
    18  // style comments, in AT&T asm syntax. Note that rsp here is a physical
    19  // register, not Go/asm's SP pseudo-register (see https://golang.org/doc/asm).
    20  // The instructions were then encoded as "BYTE $0x.." sequences, which assemble
    21  // fine on Go 1.6.
    22  
    23  // The asm code generally follows the pure Go code in encode_other.go, except
    24  // where marked with a "!!!".
    25  
    26  // ----------------------------------------------------------------------------
    27  
    28  // func emitLiteral(dst, lit []byte) int
    29  //
    30  // All local variables fit into registers. The register allocation:
    31  //	- AX	len(lit)
    32  //	- BX	n
    33  //	- DX	return value
    34  //	- DI	&dst[i]
    35  //	- R10	&lit[0]
    36  //
    37  // The 24 bytes of stack space is to call runtime·memmove.
    38  //
    39  // The unusual register allocation of local variables, such as R10 for the
    40  // source pointer, matches the allocation used at the call site in encodeBlock,
    41  // which makes it easier to manually inline this function.
    42  TEXT ·emitLiteral(SB), NOSPLIT, $24-56
    43  	MOVQ dst_base+0(FP), DI
    44  	MOVQ lit_base+24(FP), R10
    45  	MOVQ lit_len+32(FP), AX
    46  	MOVQ AX, DX
	// DX starts as len(lit); the ADDQ $k, DX below grows it by the tag size,
	// so DX becomes the total number of bytes written (the return value).
    47  	MOVL AX, BX
    48  	SUBL $1, BX
	// BX = len(lit) - 1, the value actually stored in the length encoding.
    49  
    50  	CMPL BX, $60
    51  	JLT  oneByte
    52  	CMPL BX, $256
    53  	JLT  twoBytes
    54  
    55  threeBytes:
	// 0xf4 == 61<<2: a literal tag whose 2-byte little-endian length (n-1)
	// follows in the next two bytes.
    56  	MOVB $0xf4, 0(DI)
    57  	MOVW BX, 1(DI)
    58  	ADDQ $3, DI
    59  	ADDQ $3, DX
    60  	JMP  memmove
    61  
    62  twoBytes:
	// 0xf0 == 60<<2: a literal tag whose 1-byte length (n-1) follows.
    63  	MOVB $0xf0, 0(DI)
    64  	MOVB BX, 1(DI)
    65  	ADDQ $2, DI
    66  	ADDQ $2, DX
    67  	JMP  memmove
    68  
    69  oneByte:
	// Literal lengths 1..60 encode directly in the tag byte as (n-1)<<2.
    70  	SHLB $2, BX
    71  	MOVB BX, 0(DI)
    72  	ADDQ $1, DI
    73  	ADDQ $1, DX
    74  
    75  memmove:
	// Store the return value before the call; the 24 bytes of frame reserved
	// by the TEXT directive hold memmove's three arguments.
    76  	MOVQ DX, ret+48(FP)
    77  
    78  	// copy(dst[i:], lit)
    79  	//
    80  	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
    81  	// DI, R10 and AX as arguments.
    82  	MOVQ DI, 0(SP)
    83  	MOVQ R10, 8(SP)
    84  	MOVQ AX, 16(SP)
    85  	CALL runtime·memmove(SB)
    86  	RET
    87  
    88  // ----------------------------------------------------------------------------
    89  
    90  // func emitCopy(dst []byte, offset, length int) int
    91  //
    92  // All local variables fit into registers. The register allocation:
    93  //	- AX	length
    94  //	- SI	&dst[0]
    95  //	- DI	&dst[i]
    96  //	- R11	offset
    97  //
    98  // The unusual register allocation of local variables, such as R11 for the
    99  // offset, matches the allocation used at the call site in encodeBlock, which
   100  // makes it easier to manually inline this function.
   101  TEXT ·emitCopy(SB), NOSPLIT, $0-48
   102  	MOVQ dst_base+0(FP), DI
   103  	MOVQ DI, SI
	// SI keeps &dst[0] so the byte count can be computed as DI-SI at the end.
   104  	MOVQ offset+24(FP), R11
   105  	MOVQ length+32(FP), AX
   106  
   107  loop0:
   108  	// for length >= 68 { etc }
   109  	CMPL AX, $68
   110  	JLT  step1
   111  
   112  	// Emit a length 64 copy, encoded as 3 bytes.
	// 0xfe == (63<<2)|2: a tagCopy2 byte for length 64 (stored as length-1),
	// with the 2-byte little-endian offset following.
   113  	MOVB $0xfe, 0(DI)
   114  	MOVW R11, 1(DI)
   115  	ADDQ $3, DI
   116  	SUBL $64, AX
   117  	JMP  loop0
   118  
   119  step1:
   120  	// if length > 64 { etc }
   121  	CMPL AX, $64
   122  	JLE  step2
   123  
   124  	// Emit a length 60 copy, encoded as 3 bytes.
	// 0xee == (59<<2)|2: a tagCopy2 byte for length 60, leaving a remainder
	// of at least 5 so the final copy below is always legal.
   125  	MOVB $0xee, 0(DI)
   126  	MOVW R11, 1(DI)
   127  	ADDQ $3, DI
   128  	SUBL $60, AX
   129  
   130  step2:
   131  	// if length >= 12 || offset >= 2048 { goto step3 }
   132  	CMPL AX, $12
   133  	JGE  step3
   134  	CMPL R11, $2048
   135  	JGE  step3
   136  
   137  	// Emit the remaining copy, encoded as 2 bytes.
	// tagCopy1: byte 0 = (offset>>8)<<5 | (length-4)<<2 | 1, byte 1 = offset&0xff.
	// Valid here because 4 <= length < 12 and offset < 2048 (11 bits).
   138  	MOVB R11, 1(DI)
   139  	SHRL $8, R11
   140  	SHLB $5, R11
   141  	SUBB $4, AX
   142  	SHLB $2, AX
   143  	ORB  AX, R11
   144  	ORB  $1, R11
   145  	MOVB R11, 0(DI)
   146  	ADDQ $2, DI
   147  
   148  	// Return the number of bytes written.
   149  	SUBQ SI, DI
   150  	MOVQ DI, ret+40(FP)
   151  	RET
   152  
   153  step3:
   154  	// Emit the remaining copy, encoded as 3 bytes.
	// tagCopy2: byte 0 = (length-1)<<2 | 2, bytes 1-2 = 16-bit little-endian offset.
   155  	SUBL $1, AX
   156  	SHLB $2, AX
   157  	ORB  $2, AX
   158  	MOVB AX, 0(DI)
   159  	MOVW R11, 1(DI)
   160  	ADDQ $3, DI
   161  
   162  	// Return the number of bytes written.
   163  	SUBQ SI, DI
   164  	MOVQ DI, ret+40(FP)
   165  	RET
   166  
   167  // ----------------------------------------------------------------------------
   168  
   169  // func extendMatch(src []byte, i, j int) int
   170  //
   171  // All local variables fit into registers. The register allocation:
   172  //	- DX	&src[0]
   173  //	- SI	&src[j]
   174  //	- R13	&src[len(src) - 8]
   175  //	- R14	&src[len(src)]
   176  //	- R15	&src[i]
   177  //
   178  // The unusual register allocation of local variables, such as R15 for a source
   179  // pointer, matches the allocation used at the call site in encodeBlock, which
   180  // makes it easier to manually inline this function.
   181  TEXT ·extendMatch(SB), NOSPLIT, $0-48
   182  	MOVQ src_base+0(FP), DX
   183  	MOVQ src_len+8(FP), R14
   184  	MOVQ i+24(FP), R15
   185  	MOVQ j+32(FP), SI
	// Convert the i and j indices into absolute pointers into src, and
	// precompute the two loop bounds: R14 = &src[len(src)], R13 = R14 - 8.
   186  	ADDQ DX, R14
   187  	ADDQ DX, R15
   188  	ADDQ DX, SI
   189  	MOVQ R14, R13
   190  	SUBQ $8, R13
   191  
   192  cmp8:
   193  	// As long as we are 8 or more bytes before the end of src, we can load and
   194  	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
   195  	CMPQ SI, R13
   196  	JA   cmp1
   197  	MOVQ (R15), AX
   198  	MOVQ (SI), BX
   199  	CMPQ AX, BX
   200  	JNE  bsf
   201  	ADDQ $8, R15
   202  	ADDQ $8, SI
   203  	JMP  cmp8
   204  
   205  bsf:
   206  	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
   207  	// the index of the first byte that differs. The BSF instruction finds the
   208  	// least significant 1 bit, the amd64 architecture is little-endian, and
   209  	// the shift by 3 converts a bit index to a byte index.
	// BX is non-zero here (the JNE above guarantees AX != BX), so BSFQ's
	// output is well-defined.
   210  	XORQ AX, BX
   211  	BSFQ BX, BX
   212  	SHRQ $3, BX
   213  	ADDQ BX, SI
   214  
   215  	// Convert from &src[ret] to ret.
   216  	SUBQ DX, SI
   217  	MOVQ SI, ret+40(FP)
   218  	RET
   219  
   220  cmp1:
   221  	// In src's tail, compare 1 byte at a time.
   222  	CMPQ SI, R14
   223  	JAE  extendMatchEnd
   224  	MOVB (R15), AX
   225  	MOVB (SI), BX
   226  	CMPB AX, BX
   227  	JNE  extendMatchEnd
   228  	ADDQ $1, R15
   229  	ADDQ $1, SI
   230  	JMP  cmp1
   231  
   232  extendMatchEnd:
   233  	// Convert from &src[ret] to ret.
   234  	SUBQ DX, SI
   235  	MOVQ SI, ret+40(FP)
   236  	RET
   237  
   238  // ----------------------------------------------------------------------------
   239  
   240  // func encodeBlock(dst, src []byte) (d int)
   241  //
   242  // All local variables fit into registers, other than "var table". The register
   243  // allocation:
   244  //	- AX	.	.
   245  //	- BX	.	.
   246  //	- CX	56	shift (note that amd64 shifts by non-immediates must use CX).
   247  //	- DX	64	&src[0], tableSize
   248  //	- SI	72	&src[s]
   249  //	- DI	80	&dst[d]
   250  //	- R9	88	sLimit
   251  //	- R10	.	&src[nextEmit]
   252  //	- R11	96	prevHash, currHash, nextHash, offset
   253  //	- R12	104	&src[base], skip
   254  //	- R13	.	&src[nextS], &src[len(src) - 8]
   255  //	- R14	.	len(src), bytesBetweenHashLookups, &src[len(src)], x
   256  //	- R15	112	candidate
   257  //
   258  // The second column (56, 64, etc) is the stack offset to spill the registers
   259  // when calling other functions. We could pack this slightly tighter, but it's
   260  // simpler to have a dedicated spill map independent of the function called.
   261  //
   262  // "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An
   263  // extra 56 bytes, to call other functions, and an extra 64 bytes, to spill
   264  // local variables (registers) during calls gives 32768 + 56 + 64 = 32888.
   265  TEXT ·encodeBlock(SB), 0, $32888-56
   266  	MOVQ dst_base+0(FP), DI
   267  	MOVQ src_base+24(FP), SI
   268  	MOVQ src_len+32(FP), R14
   269  
   270  	// shift, tableSize := uint32(32-8), 1<<8
   271  	MOVQ $24, CX
   272  	MOVQ $256, DX
   273  
   274  calcShift:
   275  	// for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
   276  	//	shift--
   277  	// }
	// 16384 == maxTableSize: the 32768-byte table holds 32768/2 uint16 entries.
   278  	CMPQ DX, $16384
   279  	JGE  varTable
   280  	CMPQ DX, R14
   281  	JGE  varTable
   282  	SUBQ $1, CX
   283  	SHLQ $1, DX
   284  	JMP  calcShift
   285  
   286  varTable:
   287  	// var table [maxTableSize]uint16
   288  	//
   289  	// In the asm code, unlike the Go code, we can zero-initialize only the
   290  	// first tableSize elements. Each uint16 element is 2 bytes and each MOVOU
   291  	// writes 16 bytes, so we can do only tableSize/8 writes instead of the
   292  	// 2048 writes that would zero-initialize all of table's 32768 bytes.
   293  	SHRQ $3, DX
   294  	LEAQ table-32768(SP), BX
   295  	PXOR X0, X0
   296  
   297  memclr:
   298  	MOVOU X0, 0(BX)
   299  	ADDQ  $16, BX
   300  	SUBQ  $1, DX
   301  	JNZ   memclr
   302  
   303  	// !!! DX = &src[0]
   304  	MOVQ SI, DX
   305  
   306  	// sLimit := len(src) - inputMargin
	// (inputMargin == 15, per the SUBQ $15 below.)
   307  	MOVQ R14, R9
   308  	SUBQ $15, R9
   309  
   310  	// !!! Pre-emptively spill CX, DX and R9 to the stack. Their values don't
   311  	// change for the rest of the function.
   312  	MOVQ CX, 56(SP)
   313  	MOVQ DX, 64(SP)
   314  	MOVQ R9, 88(SP)
   315  
   316  	// nextEmit := 0
   317  	MOVQ DX, R10
   318  
   319  	// s := 1
   320  	ADDQ $1, SI
   321  
   322  	// nextHash := hash(load32(src, s), shift)
	// 0x1e35a7bd is the hash multiplier; the same multiply-shift hash is
	// recomputed at every hash site below.
   323  	MOVL  0(SI), R11
   324  	IMULL $0x1e35a7bd, R11
   325  	SHRL  CX, R11
   326  
   327  outer:
   328  	// for { etc }
   329  
   330  	// skip := 32
   331  	MOVQ $32, R12
   332  
   333  	// nextS := s
   334  	MOVQ SI, R13
   335  
   336  	// candidate := 0
   337  	MOVQ $0, R15
   338  
   339  inner0:
   340  	// for { etc }
   341  
   342  	// s := nextS
   343  	MOVQ R13, SI
   344  
   345  	// bytesBetweenHashLookups := skip >> 5
   346  	MOVQ R12, R14
   347  	SHRQ $5, R14
   348  
   349  	// nextS = s + bytesBetweenHashLookups
   350  	ADDQ R14, R13
   351  
   352  	// skip += bytesBetweenHashLookups
   353  	ADDQ R14, R12
   354  
   355  	// if nextS > sLimit { goto emitRemainder }
	// AX = nextS as an index (pointer minus &src[0]); compared against sLimit.
   356  	MOVQ R13, AX
   357  	SUBQ DX, AX
   358  	CMPQ AX, R9
   359  	JA   emitRemainder
   360  
   361  	// candidate = int(table[nextHash])
   362  	// XXX: MOVWQZX table-32768(SP)(R11*2), R15
   363  	// XXX: 4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
   364  	BYTE $0x4e
   365  	BYTE $0x0f
   366  	BYTE $0xb7
   367  	BYTE $0x7c
   368  	BYTE $0x5c
   369  	BYTE $0x78
   370  
   371  	// table[nextHash] = uint16(s)
   372  	MOVQ SI, AX
   373  	SUBQ DX, AX
   374  
   375  	// XXX: MOVW AX, table-32768(SP)(R11*2)
   376  	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
   377  	BYTE $0x66
   378  	BYTE $0x42
   379  	BYTE $0x89
   380  	BYTE $0x44
   381  	BYTE $0x5c
   382  	BYTE $0x78
   383  
   384  	// nextHash = hash(load32(src, nextS), shift)
   385  	MOVL  0(R13), R11
   386  	IMULL $0x1e35a7bd, R11
   387  	SHRL  CX, R11
   388  
   389  	// if load32(src, s) != load32(src, candidate) { continue } break
   390  	MOVL 0(SI), AX
   391  	MOVL (DX)(R15*1), BX
   392  	CMPL AX, BX
   393  	JNE  inner0
   394  
   395  fourByteMatch:
   396  	// As per the encode_other.go code:
   397  	//
   398  	// A 4-byte match has been found. We'll later see etc.
   399  
   400  	// !!! Jump to a fast path for short (<= 16 byte) literals. See the comment
   401  	// on inputMargin in encode.go.
	// AX = s - nextEmit, the pending literal length.
   402  	MOVQ SI, AX
   403  	SUBQ R10, AX
   404  	CMPQ AX, $16
   405  	JLE  emitLiteralFastPath
   406  
   407  	// ----------------------------------------
   408  	// Begin inline of the emitLiteral call.
   409  	//
   410  	// d += emitLiteral(dst[d:], src[nextEmit:s])
   411  
   412  	MOVL AX, BX
   413  	SUBL $1, BX
   414  
   415  	CMPL BX, $60
   416  	JLT  inlineEmitLiteralOneByte
   417  	CMPL BX, $256
   418  	JLT  inlineEmitLiteralTwoBytes
   419  
   420  inlineEmitLiteralThreeBytes:
	// 0xf4 == 61<<2: literal tag with a 2-byte little-endian length following.
   421  	MOVB $0xf4, 0(DI)
   422  	MOVW BX, 1(DI)
   423  	ADDQ $3, DI
   424  	JMP  inlineEmitLiteralMemmove
   425  
   426  inlineEmitLiteralTwoBytes:
	// 0xf0 == 60<<2: literal tag with a 1-byte length following.
   427  	MOVB $0xf0, 0(DI)
   428  	MOVB BX, 1(DI)
   429  	ADDQ $2, DI
   430  	JMP  inlineEmitLiteralMemmove
   431  
   432  inlineEmitLiteralOneByte:
   433  	SHLB $2, BX
   434  	MOVB BX, 0(DI)
   435  	ADDQ $1, DI
   436  
   437  inlineEmitLiteralMemmove:
   438  	// Spill local variables (registers) onto the stack; call; unspill.
   439  	//
   440  	// copy(dst[i:], lit)
   441  	//
   442  	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
   443  	// DI, R10 and AX as arguments.
   444  	MOVQ DI, 0(SP)
   445  	MOVQ R10, 8(SP)
   446  	MOVQ AX, 16(SP)
   447  	ADDQ AX, DI              // Finish the "d +=" part of "d += emitLiteral(etc)".
	// Spill offsets (72, 80, 112) match the spill map in the header comment.
   448  	MOVQ SI, 72(SP)
   449  	MOVQ DI, 80(SP)
   450  	MOVQ R15, 112(SP)
   451  	CALL runtime·memmove(SB)
   452  	MOVQ 56(SP), CX
   453  	MOVQ 64(SP), DX
   454  	MOVQ 72(SP), SI
   455  	MOVQ 80(SP), DI
   456  	MOVQ 88(SP), R9
   457  	MOVQ 112(SP), R15
   458  	JMP  inner1
   459  
   460  inlineEmitLiteralEnd:
   461  	// End inline of the emitLiteral call.
   462  	// ----------------------------------------
   463  
   464  emitLiteralFastPath:
   465  	// !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".
   466  	MOVB AX, BX
   467  	SUBB $1, BX
   468  	SHLB $2, BX
   469  	MOVB BX, (DI)
   470  	ADDQ $1, DI
   471  
   472  	// !!! Implement the copy from lit to dst as a 16-byte load and store.
   473  	// (Encode's documentation says that dst and src must not overlap.)
   474  	//
   475  	// This always copies 16 bytes, instead of only len(lit) bytes, but that's
   476  	// OK. Subsequent iterations will fix up the overrun.
   477  	//
   478  	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
   479  	// 16-byte loads and stores. This technique probably wouldn't be as
   480  	// effective on architectures that are fussier about alignment.
   481  	MOVOU 0(R10), X0
   482  	MOVOU X0, 0(DI)
   483  	ADDQ  AX, DI
   484  
   485  inner1:
   486  	// for { etc }
   487  
   488  	// base := s
   489  	MOVQ SI, R12
   490  
   491  	// !!! offset := base - candidate
   492  	MOVQ R12, R11
   493  	SUBQ R15, R11
   494  	SUBQ DX, R11
   495  
   496  	// ----------------------------------------
   497  	// Begin inline of the extendMatch call.
   498  	//
   499  	// s = extendMatch(src, candidate+4, s+4)
   500  
   501  	// !!! R14 = &src[len(src)]
   502  	MOVQ src_len+32(FP), R14
   503  	ADDQ DX, R14
   504  
   505  	// !!! R13 = &src[len(src) - 8]
   506  	MOVQ R14, R13
   507  	SUBQ $8, R13
   508  
   509  	// !!! R15 = &src[candidate + 4]
   510  	ADDQ $4, R15
   511  	ADDQ DX, R15
   512  
   513  	// !!! s += 4
   514  	ADDQ $4, SI
   515  
   516  inlineExtendMatchCmp8:
   517  	// As long as we are 8 or more bytes before the end of src, we can load and
   518  	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
   519  	CMPQ SI, R13
   520  	JA   inlineExtendMatchCmp1
   521  	MOVQ (R15), AX
   522  	MOVQ (SI), BX
   523  	CMPQ AX, BX
   524  	JNE  inlineExtendMatchBSF
   525  	ADDQ $8, R15
   526  	ADDQ $8, SI
   527  	JMP  inlineExtendMatchCmp8
   528  
   529  inlineExtendMatchBSF:
   530  	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
   531  	// the index of the first byte that differs. The BSF instruction finds the
   532  	// least significant 1 bit, the amd64 architecture is little-endian, and
   533  	// the shift by 3 converts a bit index to a byte index.
   534  	XORQ AX, BX
   535  	BSFQ BX, BX
   536  	SHRQ $3, BX
   537  	ADDQ BX, SI
   538  	JMP  inlineExtendMatchEnd
   539  
   540  inlineExtendMatchCmp1:
   541  	// In src's tail, compare 1 byte at a time.
   542  	CMPQ SI, R14
   543  	JAE  inlineExtendMatchEnd
   544  	MOVB (R15), AX
   545  	MOVB (SI), BX
   546  	CMPB AX, BX
   547  	JNE  inlineExtendMatchEnd
   548  	ADDQ $1, R15
   549  	ADDQ $1, SI
   550  	JMP  inlineExtendMatchCmp1
   551  
   552  inlineExtendMatchEnd:
   553  	// End inline of the extendMatch call.
   554  	// ----------------------------------------
   555  
   556  	// ----------------------------------------
   557  	// Begin inline of the emitCopy call.
   558  	//
   559  	// d += emitCopy(dst[d:], base-candidate, s-base)
   560  
   561  	// !!! length := s - base
   562  	MOVQ SI, AX
   563  	SUBQ R12, AX
   564  
   565  inlineEmitCopyLoop0:
   566  	// for length >= 68 { etc }
   567  	CMPL AX, $68
   568  	JLT  inlineEmitCopyStep1
   569  
   570  	// Emit a length 64 copy, encoded as 3 bytes.
	// 0xfe == (63<<2)|2: tagCopy2 for length 64 with a 2-byte offset following.
   571  	MOVB $0xfe, 0(DI)
   572  	MOVW R11, 1(DI)
   573  	ADDQ $3, DI
   574  	SUBL $64, AX
   575  	JMP  inlineEmitCopyLoop0
   576  
   577  inlineEmitCopyStep1:
   578  	// if length > 64 { etc }
   579  	CMPL AX, $64
   580  	JLE  inlineEmitCopyStep2
   581  
   582  	// Emit a length 60 copy, encoded as 3 bytes.
	// 0xee == (59<<2)|2: tagCopy2 for length 60.
   583  	MOVB $0xee, 0(DI)
   584  	MOVW R11, 1(DI)
   585  	ADDQ $3, DI
   586  	SUBL $60, AX
   587  
   588  inlineEmitCopyStep2:
   589  	// if length >= 12 || offset >= 2048 { goto inlineEmitCopyStep3 }
   590  	CMPL AX, $12
   591  	JGE  inlineEmitCopyStep3
   592  	CMPL R11, $2048
   593  	JGE  inlineEmitCopyStep3
   594  
   595  	// Emit the remaining copy, encoded as 2 bytes.
	// tagCopy1: byte 0 = (offset>>8)<<5 | (length-4)<<2 | 1, byte 1 = offset&0xff.
   596  	MOVB R11, 1(DI)
   597  	SHRL $8, R11
   598  	SHLB $5, R11
   599  	SUBB $4, AX
   600  	SHLB $2, AX
   601  	ORB  AX, R11
   602  	ORB  $1, R11
   603  	MOVB R11, 0(DI)
   604  	ADDQ $2, DI
   605  	JMP  inlineEmitCopyEnd
   606  
   607  inlineEmitCopyStep3:
   608  	// Emit the remaining copy, encoded as 3 bytes.
	// tagCopy2: byte 0 = (length-1)<<2 | 2, bytes 1-2 = 16-bit little-endian offset.
   609  	SUBL $1, AX
   610  	SHLB $2, AX
   611  	ORB  $2, AX
   612  	MOVB AX, 0(DI)
   613  	MOVW R11, 1(DI)
   614  	ADDQ $3, DI
   615  
   616  inlineEmitCopyEnd:
   617  	// End inline of the emitCopy call.
   618  	// ----------------------------------------
   619  
   620  	// nextEmit = s
   621  	MOVQ SI, R10
   622  
   623  	// if s >= sLimit { goto emitRemainder }
   624  	MOVQ SI, AX
   625  	SUBQ DX, AX
   626  	CMPQ AX, R9
   627  	JAE  emitRemainder
   628  
   629  	// As per the encode_other.go code:
   630  	//
   631  	// We could immediately etc.
   632  
   633  	// x := load64(src, s-1)
   634  	MOVQ -1(SI), R14
   635  
   636  	// prevHash := hash(uint32(x>>0), shift)
   637  	MOVL  R14, R11
   638  	IMULL $0x1e35a7bd, R11
   639  	SHRL  CX, R11
   640  
   641  	// table[prevHash] = uint16(s-1)
   642  	MOVQ SI, AX
   643  	SUBQ DX, AX
   644  	SUBQ $1, AX
   645  
   646  	// XXX: MOVW AX, table-32768(SP)(R11*2)
   647  	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
   648  	BYTE $0x66
   649  	BYTE $0x42
   650  	BYTE $0x89
   651  	BYTE $0x44
   652  	BYTE $0x5c
   653  	BYTE $0x78
   654  
   655  	// currHash := hash(uint32(x>>8), shift)
   656  	SHRQ  $8, R14
   657  	MOVL  R14, R11
   658  	IMULL $0x1e35a7bd, R11
   659  	SHRL  CX, R11
   660  
   661  	// candidate = int(table[currHash])
   662  	// XXX: MOVWQZX table-32768(SP)(R11*2), R15
   663  	// XXX: 4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
   664  	BYTE $0x4e
   665  	BYTE $0x0f
   666  	BYTE $0xb7
   667  	BYTE $0x7c
   668  	BYTE $0x5c
   669  	BYTE $0x78
   670  
   671  	// table[currHash] = uint16(s)
	// AX still holds s-1 from above, so one ADDQ yields uint16(s).
   672  	ADDQ $1, AX
   673  
   674  	// XXX: MOVW AX, table-32768(SP)(R11*2)
   675  	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
   676  	BYTE $0x66
   677  	BYTE $0x42
   678  	BYTE $0x89
   679  	BYTE $0x44
   680  	BYTE $0x5c
   681  	BYTE $0x78
   682  
   683  	// if uint32(x>>8) == load32(src, candidate) { continue }
   684  	MOVL (DX)(R15*1), BX
   685  	CMPL R14, BX
   686  	JEQ  inner1
   687  
   688  	// nextHash = hash(uint32(x>>16), shift)
	// R14 already holds x>>8, so one more SHRQ $8 gives x>>16.
   689  	SHRQ  $8, R14
   690  	MOVL  R14, R11
   691  	IMULL $0x1e35a7bd, R11
   692  	SHRL  CX, R11
   693  
   694  	// s++
   695  	ADDQ $1, SI
   696  
   697  	// break out of the inner1 for loop, i.e. continue the outer loop.
   698  	JMP outer
   699  
   700  emitRemainder:
   701  	// if nextEmit < len(src) { etc }
   702  	MOVQ src_len+32(FP), AX
   703  	ADDQ DX, AX
   704  	CMPQ R10, AX
   705  	JEQ  encodeBlockEnd
   706  
   707  	// d += emitLiteral(dst[d:], src[nextEmit:])
   708  	//
   709  	// Push args.
   710  	MOVQ DI, 0(SP)
   711  	MOVQ $0, 8(SP)   // Unnecessary, as the callee ignores it, but conservative.
   712  	MOVQ $0, 16(SP)  // Unnecessary, as the callee ignores it, but conservative.
   713  	MOVQ R10, 24(SP)
   714  	SUBQ R10, AX
   715  	MOVQ AX, 32(SP)
   716  	MOVQ AX, 40(SP)  // Unnecessary, as the callee ignores it, but conservative.
   717  
   718  	// Spill local variables (registers) onto the stack; call; unspill.
   719  	MOVQ DI, 80(SP)
   720  	CALL ·emitLiteral(SB)
   721  	MOVQ 80(SP), DI
   722  
   723  	// Finish the "d +=" part of "d += emitLiteral(etc)".
	// 48(SP) is emitLiteral's return value slot in the outgoing args area.
   724  	ADDQ 48(SP), DI
   725  
   726  encodeBlockEnd:
   727  	MOVQ dst_base+0(FP), AX
   728  	SUBQ AX, DI
   729  	MOVQ DI, d+48(FP)
   730  	RET