github.com/piotrnar/gocoin@v0.0.0-20240512203912-faa0448c5e96/lib/others/snappy/decode_amd64.s

github.com/piotrnar/gocoin@v0.0.0-20240512203912-faa0448c5e96/lib/others/snappy/decode_amd64.s (about)

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build !appengine
     6  // +build gc
     7  // +build !noasm
     8  
     9  #include "textflag.h"
    10  
    11  // The asm code generally follows the pure Go code in decode_other.go, except
    12  // where marked with a "!!!".
    13  
    14  // func decode(dst, src []byte) int
    15  //
    16  // All local variables fit into registers. The non-zero stack size is only to
    17  // spill registers and push args when issuing a CALL. The register allocation:
    18  //	- AX	scratch
    19  //	- BX	scratch
    20  //	- CX	length or x
    21  //	- DX	offset
    22  //	- SI	&src[s]
    23  //	- DI	&dst[d]
    24  //	+ R8	dst_base
    25  //	+ R9	dst_len
    26  //	+ R10	dst_base + dst_len
    27  //	+ R11	src_base
    28  //	+ R12	src_len
    29  //	+ R13	src_base + src_len
    30  //	- R14	used by doCopy
    31  //	- R15	used by doCopy
    32  //
    33  // The registers R8-R13 (marked with a "+") are set at the start of the
    34  // function, and after a CALL returns, and are not otherwise modified.
    35  //
    36  // The d variable is implicitly DI - R8,  and len(dst)-d is R10 - DI.
    37  // The s variable is implicitly SI - R11, and len(src)-s is R13 - SI.
    38  TEXT ·decode(SB), NOSPLIT, $48-56
    39  	// Initialize SI, DI and R8-R13.
    40  	MOVQ dst_base+0(FP), R8
    41  	MOVQ dst_len+8(FP), R9
    42  	MOVQ R8, DI
    43  	MOVQ R8, R10
    44  	ADDQ R9, R10
    45  	MOVQ src_base+24(FP), R11
    46  	MOVQ src_len+32(FP), R12
    47  	MOVQ R11, SI
    48  	MOVQ R11, R13
    49  	ADDQ R12, R13
    50  
    51  loop:
    52  	// for s < len(src)
    53  	CMPQ SI, R13
    54  	JEQ  end
    55  
    56  	// CX = uint32(src[s])
    57  	//
    58  	// switch src[s] & 0x03
    59  	MOVBLZX (SI), CX
    60  	MOVL    CX, BX
    61  	ANDL    $3, BX
    62  	CMPL    BX, $1
    63  	JAE     tagCopy
    64  
    65  	// ----------------------------------------
    66  	// The code below handles literal tags.
    67  
    68  	// case tagLiteral:
    69  	// x := uint32(src[s] >> 2)
    70  	// switch
    71  	SHRL $2, CX
    72  	CMPL CX, $60
    73  	JAE  tagLit60Plus
    74  
    75  	// case x < 60:
    76  	// s++
    77  	INCQ SI
    78  
    79  doLit:
    80  	// This is the end of the inner "switch", when we have a literal tag.
    81  	//
    82  	// We assume that CX == x and x fits in a uint32, where x is the variable
    83  	// used in the pure Go decode_other.go code.
    84  
    85  	// length = int(x) + 1
    86  	//
    87  	// Unlike the pure Go code, we don't need to check if length <= 0 because
    88  	// CX can hold 64 bits, so the increment cannot overflow.
    89  	INCQ CX
    90  
    91  	// Prepare to check if copying length bytes will run past the end of dst or
    92  	// src.
    93  	//
    94  	// AX = len(dst) - d
    95  	// BX = len(src) - s
    96  	MOVQ R10, AX
    97  	SUBQ DI, AX
    98  	MOVQ R13, BX
    99  	SUBQ SI, BX
   100  
   101  	// !!! Try a faster technique for short (16 or fewer bytes) copies.
   102  	//
   103  	// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
   104  	//   goto callMemmove // Fall back on calling runtime·memmove.
   105  	// }
   106  	//
   107  	// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
   108  	// against 21 instead of 16, because it cannot assume that all of its input
   109  	// is contiguous in memory and so it needs to leave enough source bytes to
   110  	// read the next tag without refilling buffers, but Go's Decode assumes
   111  	// contiguousness (the src argument is a []byte).
   112  	CMPQ CX, $16
   113  	JGT  callMemmove
   114  	CMPQ AX, $16
   115  	JLT  callMemmove
   116  	CMPQ BX, $16
   117  	JLT  callMemmove
   118  
   119  	// !!! Implement the copy from src to dst as a 16-byte load and store.
   120  	// (Decode's documentation says that dst and src must not overlap.)
   121  	//
   122  	// This always copies 16 bytes, instead of only length bytes, but that's
   123  	// OK. If the input is a valid Snappy encoding then subsequent iterations
   124  	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
   125  	// non-nil error), so the overrun will be ignored.
   126  	//
   127  	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
   128  	// 16-byte loads and stores. This technique probably wouldn't be as
   129  	// effective on architectures that are fussier about alignment.
   130  	MOVOU 0(SI), X0
   131  	MOVOU X0, 0(DI)
   132  
   133  	// d += length
   134  	// s += length
   135  	ADDQ CX, DI
   136  	ADDQ CX, SI
   137  	JMP  loop
   138  
   139  callMemmove:
   140  	// if length > len(dst)-d || length > len(src)-s { etc }
   141  	CMPQ CX, AX
   142  	JGT  errCorrupt
   143  	CMPQ CX, BX
   144  	JGT  errCorrupt
   145  
   146  	// copy(dst[d:], src[s:s+length])
   147  	//
   148  	// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
   149  	// DI, SI and CX as arguments. Coincidentally, we also need to spill those
   150  	// three registers to the stack, to save local variables across the CALL.
   151  	MOVQ DI, 0(SP)
   152  	MOVQ SI, 8(SP)
   153  	MOVQ CX, 16(SP)
   154  	MOVQ DI, 24(SP)
   155  	MOVQ SI, 32(SP)
   156  	MOVQ CX, 40(SP)
   157  	CALL runtime·memmove(SB)
   158  
   159  	// Restore local variables: unspill registers from the stack and
   160  	// re-calculate R8-R13.
   161  	MOVQ 24(SP), DI
   162  	MOVQ 32(SP), SI
   163  	MOVQ 40(SP), CX
   164  	MOVQ dst_base+0(FP), R8
   165  	MOVQ dst_len+8(FP), R9
   166  	MOVQ R8, R10
   167  	ADDQ R9, R10
   168  	MOVQ src_base+24(FP), R11
   169  	MOVQ src_len+32(FP), R12
   170  	MOVQ R11, R13
   171  	ADDQ R12, R13
   172  
   173  	// d += length
   174  	// s += length
   175  	ADDQ CX, DI
   176  	ADDQ CX, SI
   177  	JMP  loop
   178  
   179  tagLit60Plus:
   180  	// !!! This fragment does the
   181  	//
   182  	// s += x - 58; if uint(s) > uint(len(src)) { etc }
   183  	//
   184  	// checks. In the asm version, we code it once instead of once per switch case.
   185  	ADDQ CX, SI
   186  	SUBQ $58, SI
   187  	MOVQ SI, BX
   188  	SUBQ R11, BX
   189  	CMPQ BX, R12
   190  	JA   errCorrupt
   191  
   192  	// case x == 60:
   193  	CMPL CX, $61
   194  	JEQ  tagLit61
   195  	JA   tagLit62Plus
   196  
   197  	// x = uint32(src[s-1])
   198  	MOVBLZX -1(SI), CX
   199  	JMP     doLit
   200  
   201  tagLit61:
   202  	// case x == 61:
   203  	// x = uint32(src[s-2]) | uint32(src[s-1])<<8
   204  	MOVWLZX -2(SI), CX
   205  	JMP     doLit
   206  
   207  tagLit62Plus:
   208  	CMPL CX, $62
   209  	JA   tagLit63
   210  
   211  	// case x == 62:
   212  	// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
   213  	MOVWLZX -3(SI), CX
   214  	MOVBLZX -1(SI), BX
   215  	SHLL    $16, BX
   216  	ORL     BX, CX
   217  	JMP     doLit
   218  
   219  tagLit63:
   220  	// case x == 63:
   221  	// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
   222  	MOVL -4(SI), CX
   223  	JMP  doLit
   224  
   225  // The code above handles literal tags.
   226  // ----------------------------------------
   227  // The code below handles copy tags.
   228  
   229  tagCopy4:
   230  	// case tagCopy4:
   231  	// s += 5
   232  	ADDQ $5, SI
   233  
   234  	// if uint(s) > uint(len(src)) { etc }
   235  	MOVQ SI, BX
   236  	SUBQ R11, BX
   237  	CMPQ BX, R12
   238  	JA   errCorrupt
   239  
   240  	// length = 1 + int(src[s-5])>>2
   241  	SHRQ $2, CX
   242  	INCQ CX
   243  
   244  	// offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
   245  	MOVLQZX -4(SI), DX
   246  	JMP     doCopy
   247  
   248  tagCopy2:
   249  	// case tagCopy2:
   250  	// s += 3
   251  	ADDQ $3, SI
   252  
   253  	// if uint(s) > uint(len(src)) { etc }
   254  	MOVQ SI, BX
   255  	SUBQ R11, BX
   256  	CMPQ BX, R12
   257  	JA   errCorrupt
   258  
   259  	// length = 1 + int(src[s-3])>>2
   260  	SHRQ $2, CX
   261  	INCQ CX
   262  
   263  	// offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
   264  	MOVWQZX -2(SI), DX
   265  	JMP     doCopy
   266  
   267  tagCopy:
   268  	// We have a copy tag. We assume that:
   269  	//	- BX == src[s] & 0x03
   270  	//	- CX == src[s]
   271  	CMPQ BX, $2
   272  	JEQ  tagCopy2
   273  	JA   tagCopy4
   274  
   275  	// case tagCopy1:
   276  	// s += 2
   277  	ADDQ $2, SI
   278  
   279  	// if uint(s) > uint(len(src)) { etc }
   280  	MOVQ SI, BX
   281  	SUBQ R11, BX
   282  	CMPQ BX, R12
   283  	JA   errCorrupt
   284  
   285  	// offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
   286  	MOVQ    CX, DX
   287  	ANDQ    $0xe0, DX
   288  	SHLQ    $3, DX
   289  	MOVBQZX -1(SI), BX
   290  	ORQ     BX, DX
   291  
   292  	// length = 4 + int(src[s-2])>>2&0x7
   293  	SHRQ $2, CX
   294  	ANDQ $7, CX
   295  	ADDQ $4, CX
   296  
   297  doCopy:
   298  	// This is the end of the outer "switch", when we have a copy tag.
   299  	//
   300  	// We assume that:
   301  	//	- CX == length && CX > 0
   302  	//	- DX == offset
   303  
   304  	// if offset <= 0 { etc }
   305  	CMPQ DX, $0
   306  	JLE  errCorrupt
   307  
   308  	// if d < offset { etc }
   309  	MOVQ DI, BX
   310  	SUBQ R8, BX
   311  	CMPQ BX, DX
   312  	JLT  errCorrupt
   313  
   314  	// if length > len(dst)-d { etc }
   315  	MOVQ R10, BX
   316  	SUBQ DI, BX
   317  	CMPQ CX, BX
   318  	JGT  errCorrupt
   319  
   320  	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
   321  	//
   322  	// Set:
   323  	//	- R14 = len(dst)-d
   324  	//	- R15 = &dst[d-offset]
   325  	MOVQ R10, R14
   326  	SUBQ DI, R14
   327  	MOVQ DI, R15
   328  	SUBQ DX, R15
   329  
   330  	// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
   331  	//
   332  	// First, try using two 8-byte load/stores, similar to the doLit technique
   333  	// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
   334  	// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
   335  	// and not one 16-byte load/store, and the first store has to be before the
   336  	// second load, due to the overlap if offset is in the range [8, 16).
   337  	//
   338  	// if length > 16 || offset < 8 || len(dst)-d < 16 {
   339  	//   goto slowForwardCopy
   340  	// }
   341  	// copy 16 bytes
   342  	// d += length
   343  	CMPQ CX, $16
   344  	JGT  slowForwardCopy
   345  	CMPQ DX, $8
   346  	JLT  slowForwardCopy
   347  	CMPQ R14, $16
   348  	JLT  slowForwardCopy
   349  	MOVQ 0(R15), AX
   350  	MOVQ AX, 0(DI)
   351  	MOVQ 8(R15), BX
   352  	MOVQ BX, 8(DI)
   353  	ADDQ CX, DI
   354  	JMP  loop
   355  
   356  slowForwardCopy:
   357  	// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
   358  	// can still try 8-byte load stores, provided we can overrun up to 10 extra
   359  	// bytes. As above, the overrun will be fixed up by subsequent iterations
   360  	// of the outermost loop.
   361  	//
   362  	// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
   363  	// commentary says:
   364  	//
   365  	// ----
   366  	//
   367  	// The main part of this loop is a simple copy of eight bytes at a time
   368  	// until we've copied (at least) the requested amount of bytes.  However,
   369  	// if d and d-offset are less than eight bytes apart (indicating a
   370  	// repeating pattern of length < 8), we first need to expand the pattern in
   371  	// order to get the correct results. For instance, if the buffer looks like
   372  	// this, with the eight-byte <d-offset> and <d> patterns marked as
   373  	// intervals:
   374  	//
   375  	//    abxxxxxxxxxxxx
   376  	//    [------]           d-offset
   377  	//      [------]         d
   378  	//
   379  	// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
   380  	// once, after which we can move <d> two bytes without moving <d-offset>:
   381  	//
   382  	//    ababxxxxxxxxxx
   383  	//    [------]           d-offset
   384  	//        [------]       d
   385  	//
   386  	// and repeat the exercise until the two no longer overlap.
   387  	//
   388  	// This allows us to do very well in the special case of one single byte
   389  	// repeated many times, without taking a big hit for more general cases.
   390  	//
   391  	// The worst case of extra writing past the end of the match occurs when
   392  	// offset == 1 and length == 1; the last copy will read from byte positions
   393  	// [0..7] and write to [4..11], whereas it was only supposed to write to
   394  	// position 1. Thus, ten excess bytes.
   395  	//
   396  	// ----
   397  	//
   398  	// That "10 byte overrun" worst case is confirmed by Go's
   399  	// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
   400  	// and finishSlowForwardCopy algorithm.
   401  	//
   402  	// if length > len(dst)-d-10 {
   403  	//   goto verySlowForwardCopy
   404  	// }
   405  	SUBQ $10, R14
   406  	CMPQ CX, R14
   407  	JGT  verySlowForwardCopy
   408  
   409  makeOffsetAtLeast8:
   410  	// !!! As above, expand the pattern so that offset >= 8 and we can use
   411  	// 8-byte load/stores.
   412  	//
   413  	// for offset < 8 {
   414  	//   copy 8 bytes from dst[d-offset:] to dst[d:]
   415  	//   length -= offset
   416  	//   d      += offset
   417  	//   offset += offset
   418  	//   // The two previous lines together means that d-offset, and therefore
   419  	//   // R15, is unchanged.
   420  	// }
   421  	CMPQ DX, $8
   422  	JGE  fixUpSlowForwardCopy
   423  	MOVQ (R15), BX
   424  	MOVQ BX, (DI)
   425  	SUBQ DX, CX
   426  	ADDQ DX, DI
   427  	ADDQ DX, DX
   428  	JMP  makeOffsetAtLeast8
   429  
   430  fixUpSlowForwardCopy:
   431  	// !!! Add length (which might be negative now) to d (implied by DI being
   432  	// &dst[d]) so that d ends up at the right place when we jump back to the
   433  	// top of the loop. Before we do that, though, we save DI to AX so that, if
   434  	// length is positive, copying the remaining length bytes will write to the
   435  	// right place.
   436  	MOVQ DI, AX
   437  	ADDQ CX, DI
   438  
   439  finishSlowForwardCopy:
   440  	// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
   441  	// length means that we overrun, but as above, that will be fixed up by
   442  	// subsequent iterations of the outermost loop.
   443  	CMPQ CX, $0
   444  	JLE  loop
   445  	MOVQ (R15), BX
   446  	MOVQ BX, (AX)
   447  	ADDQ $8, R15
   448  	ADDQ $8, AX
   449  	SUBQ $8, CX
   450  	JMP  finishSlowForwardCopy
   451  
   452  verySlowForwardCopy:
   453  	// verySlowForwardCopy is a simple implementation of forward copy. In C
   454  	// parlance, this is a do/while loop instead of a while loop, since we know
   455  	// that length > 0. In Go syntax:
   456  	//
   457  	// for {
   458  	//   dst[d] = dst[d - offset]
   459  	//   d++
   460  	//   length--
   461  	//   if length == 0 {
   462  	//     break
   463  	//   }
   464  	// }
   465  	MOVB (R15), BX
   466  	MOVB BX, (DI)
   467  	INCQ R15
   468  	INCQ DI
   469  	DECQ CX
   470  	JNZ  verySlowForwardCopy
   471  	JMP  loop
   472  
   473  // The code above handles copy tags.
   474  // ----------------------------------------
   475  
   476  end:
   477  	// This is the end of the "for s < len(src)".
   478  	//
   479  	// if d != len(dst) { etc }
   480  	CMPQ DI, R10
   481  	JNE  errCorrupt
   482  
   483  	// return 0
   484  	MOVQ $0, ret+48(FP)
   485  	RET
   486  
   487  errCorrupt:
   488  	// return decodeErrCodeCorrupt
   489  	MOVQ $1, ret+48(FP)
   490  	RET