github.com/insionng/yougam@v0.0.0-20170714101924-2bc18d833463/libraries/golang/snappy/decode_amd64.s (about)

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build gc
     6  
     7  #include "textflag.h"
     8  
     9  // func decode(dst, src []byte) int
    10  // decode expands src into dst and returns 0 on success, 1 (decodeErrCodeCorrupt) or 3 (decodeErrCodeUnsupportedCopy4Tag).
    11  // The asm code generally follows the pure Go code in decode_other.go, except
    12  // where marked with a "!!!".
    13  // dst must be exactly as long as the decoded output: finishing with d != len(dst) is reported as corrupt.
    14  // All local variables fit into registers. The non-zero stack size is only to
    15  // spill registers and push args when issuing a CALL. The register allocation:
    16  //	- AX	scratch (len(dst)-d while handling a literal; write cursor in finishSlowForwardCopy)
    17  //	- BX	scratch (tag type right after the tag byte is loaded; len(src)-s while handling a literal)
    18  //	- CX	length or x
    19  //	- DX	offset
    20  //	- SI	&src[s]
    21  //	- DI	&dst[d]
    22  //	+ R8	dst_base
    23  //	+ R9	dst_len
    24  //	+ R10	dst_base + dst_len
    25  //	+ R11	src_base
    26  //	+ R12	src_len
    27  //	+ R13	src_base + src_len
    28  //	- R14	used by doCopy (len(dst)-d, then len(dst)-d-10 in slowForwardCopy)
    29  //	- R15	used by doCopy (&dst[d-offset])
    30  // Frame ($48-56): 0(SP), 8(SP) and 16(SP) carry the memmove arguments, 24(SP), 32(SP) and 40(SP) hold the spilled DI, SI and CX; the 56 argument bytes are dst at 0(FP), src at 24(FP) and the int result at ret+48(FP).
    31  // The registers R8-R13 (marked with a "+") are set at the start of the
    32  // function, and after a CALL returns, and are not otherwise modified.
    33  //
    34  // The d variable is implicitly DI - R8,  and len(dst)-d is R10 - DI.
    35  // The s variable is implicitly SI - R11, and len(src)-s is R13 - SI.
    36  TEXT ·decode(SB), NOSPLIT, $48-56
    37  	// Initialize SI, DI and R8-R13. Pointing DI and SI at the slice bases makes d == 0 and s == 0.
    38  	MOVQ dst_base+0(FP), R8
    39  	MOVQ dst_len+8(FP), R9
    40  	MOVQ R8, DI
    41  	MOVQ R8, R10
    42  	ADDQ R9, R10
    43  	MOVQ src_base+24(FP), R11
    44  	MOVQ src_len+32(FP), R12
    45  	MOVQ R11, SI
    46  	MOVQ R11, R13
    47  	ADDQ R12, R13
    48  
    49  loop:
    50  	// for s < len(src) (an equality test is enough: every path that advances SI first checks that it stays within src)
    51  	CMPQ SI, R13
    52  	JEQ  end
    53  
    54  	// CX = uint32(src[s])
    55  	//
    56  	// switch src[s] & 0x03
    57  	MOVBLZX (SI), CX
    58  	MOVL    CX, BX
    59  	ANDL    $3, BX // BX = tag type: 0 falls through to the literal code, 1-3 go to tagCopy
    60  	CMPL    BX, $1
    61  	JAE     tagCopy
    62  
    63  	// ----------------------------------------
    64  	// The code below handles literal tags.
    65  
    66  	// case tagLiteral:
    67  	// x := uint32(src[s] >> 2)
    68  	// switch
    69  	SHRL $2, CX
    70  	CMPL CX, $60
    71  	JAE  tagLit60Plus // x >= 60: x is then read from the 1-4 bytes that follow the tag
    72  
    73  	// case x < 60:
    74  	// s++
    75  	INCQ SI
    76  
    77  doLit:
    78  	// This is the end of the inner "switch", when we have a literal tag.
    79  	//
    80  	// We assume that CX == x and x fits in a uint32, where x is the variable
    81  	// used in the pure Go decode_other.go code.
    82  
    83  	// length = int(x) + 1
    84  	//
    85  	// Unlike the pure Go code, we don't need to check if length <= 0 because
    86  	// CX can hold 64 bits, so the increment cannot overflow.
    87  	INCQ CX
    88  
    89  	// Prepare to check if copying length bytes will run past the end of dst or
    90  	// src.
    91  	//
    92  	// AX = len(dst) - d
    93  	// BX = len(src) - s
    94  	MOVQ R10, AX
    95  	SUBQ DI, AX
    96  	MOVQ R13, BX
    97  	SUBQ SI, BX
    98  
    99  	// !!! Try a faster technique for short (16 or fewer bytes) copies.
   100  	//
   101  	// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
   102  	//   goto callMemmove // Fall back on calling runtime·memmove.
   103  	// }
   104  	//
   105  	// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
   106  	// against 21 instead of 16, because it cannot assume that all of its input
   107  	// is contiguous in memory and so it needs to leave enough source bytes to
   108  	// read the next tag without refilling buffers, but Go's Decode assumes
   109  	// contiguousness (the src argument is a []byte).
   110  	CMPQ CX, $16
   111  	JGT  callMemmove
   112  	CMPQ AX, $16
   113  	JLT  callMemmove
   114  	CMPQ BX, $16
   115  	JLT  callMemmove
   116  
   117  	// !!! Implement the copy from src to dst as a 16-byte load and store.
   118  	// (Decode's documentation says that dst and src must not overlap.)
   119  	//
   120  	// This always copies 16 bytes, instead of only length bytes, but that's
   121  	// OK. If the input is a valid Snappy encoding then subsequent iterations
   122  	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
   123  	// non-nil error), so the overrun will be ignored.
   124  	//
   125  	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
   126  	// 16-byte loads and stores. This technique probably wouldn't be as
   127  	// effective on architectures that are fussier about alignment.
   128  	MOVOU 0(SI), X0
   129  	MOVOU X0, 0(DI)
   130  
   131  	// d += length
   132  	// s += length
   133  	ADDQ CX, DI
   134  	ADDQ CX, SI
   135  	JMP  loop
   136  
   137  callMemmove:
   138  	// if length > len(dst)-d || length > len(src)-s { etc } (AX and BX still hold the values computed in doLit)
   139  	CMPQ CX, AX
   140  	JGT  errCorrupt
   141  	CMPQ CX, BX
   142  	JGT  errCorrupt
   143  
   144  	// copy(dst[d:], src[s:s+length])
   145  	//
   146  	// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
   147  	// DI, SI and CX as arguments. Coincidentally, we also need to spill those
   148  	// three registers to the stack, to save local variables across the CALL.
   149  	MOVQ DI, 0(SP) // memmove argument: &dst[d]
   150  	MOVQ SI, 8(SP) // memmove argument: &src[s]
   151  	MOVQ CX, 16(SP) // memmove argument: length
   152  	MOVQ DI, 24(SP) // spill slot, reloaded after the CALL
   153  	MOVQ SI, 32(SP) // spill slot, reloaded after the CALL
   154  	MOVQ CX, 40(SP) // spill slot, reloaded after the CALL
   155  	CALL runtime·memmove(SB)
   156  
   157  	// Restore local variables: unspill registers from the stack and
   158  	// re-calculate R8-R13 from the function arguments, exactly as at function entry.
   159  	MOVQ 24(SP), DI
   160  	MOVQ 32(SP), SI
   161  	MOVQ 40(SP), CX
   162  	MOVQ dst_base+0(FP), R8
   163  	MOVQ dst_len+8(FP), R9
   164  	MOVQ R8, R10
   165  	ADDQ R9, R10
   166  	MOVQ src_base+24(FP), R11
   167  	MOVQ src_len+32(FP), R12
   168  	MOVQ R11, R13
   169  	ADDQ R12, R13
   170  
   171  	// d += length
   172  	// s += length
   173  	ADDQ CX, DI
   174  	ADDQ CX, SI
   175  	JMP  loop
   176  
   177  tagLit60Plus:
   178  	// !!! This fragment does the
   179  	//
   180  	// s += x - 58; if uint(s) > uint(len(src)) { etc }
   181  	//
   182  	// checks. In the asm version, we code it once instead of once per switch case.
   183  	ADDQ CX, SI
   184  	SUBQ $58, SI // x is 60-63 here, so s advances by 2-5: the tag byte plus 1-4 length bytes
   185  	MOVQ SI, BX
   186  	SUBQ R11, BX
   187  	CMPQ BX, R12
   188  	JA   errCorrupt // unsigned comparison, matching the uint conversions in the Go code
   189  
   190  	// case x == 60:
   191  	CMPL CX, $61
   192  	JEQ  tagLit61
   193  	JA   tagLit62Plus // reuses the flags from the CMPL above; falling through means x == 60
   194  
   195  	// x = uint32(src[s-1])
   196  	MOVBLZX -1(SI), CX
   197  	JMP     doLit
   198  
   199  tagLit61:
   200  	// case x == 61:
   201  	// x = uint32(src[s-2]) | uint32(src[s-1])<<8
   202  	MOVWLZX -2(SI), CX
   203  	JMP     doLit
   204  
   205  tagLit62Plus:
   206  	CMPL CX, $62
   207  	JA   tagLit63
   208  
   209  	// case x == 62:
   210  	// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
   211  	MOVWLZX -3(SI), CX // low 16 bits: src[s-3] | src[s-2]<<8
   212  	MOVBLZX -1(SI), BX // third byte, shifted into bits 16-23 below
   213  	SHLL    $16, BX
   214  	ORL     BX, CX
   215  	JMP     doLit
   216  
   217  tagLit63:
   218  	// case x == 63:
   219  	// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
   220  	MOVL -4(SI), CX
   221  	JMP  doLit
   222  
   223  // The code above handles literal tags.
   224  // ----------------------------------------
   225  // The code below handles copy tags.
   226  
   227  tagCopy2:
   228  	// case tagCopy2:
   229  	// s += 3
   230  	ADDQ $3, SI
   231  
   232  	// if uint(s) > uint(len(src)) { etc }
   233  	MOVQ SI, BX
   234  	SUBQ R11, BX
   235  	CMPQ BX, R12
   236  	JA   errCorrupt
   237  
   238  	// length = 1 + int(src[s-3])>>2 (CX still holds the tag byte loaded at the top of the loop)
   239  	SHRQ $2, CX
   240  	INCQ CX
   241  
   242  	// offset = int(src[s-2]) | int(src[s-1])<<8
   243  	MOVWQZX -2(SI), DX
   244  	JMP     doCopy
   245  
   246  tagCopy:
   247  	// We have a copy tag. We assume that:
   248  	//	- BX == src[s] & 0x03
   249  	//	- CX == src[s]
   250  	CMPQ BX, $2
   251  	JEQ  tagCopy2
   252  	JA   errUC4T // BX == 3, which is reported as decodeErrCodeUnsupportedCopy4Tag
   253  
   254  	// case tagCopy1:
   255  	// s += 2
   256  	ADDQ $2, SI
   257  
   258  	// if uint(s) > uint(len(src)) { etc }
   259  	MOVQ SI, BX
   260  	SUBQ R11, BX
   261  	CMPQ BX, R12
   262  	JA   errCorrupt
   263  
   264  	// offset = int(src[s-2])&0xe0<<3 | int(src[s-1])
   265  	MOVQ    CX, DX
   266  	ANDQ    $0xe0, DX
   267  	SHLQ    $3, DX // the tag byte's top three bits become offset bits 8-10
   268  	MOVBQZX -1(SI), BX
   269  	ORQ     BX, DX
   270  
   271  	// length = 4 + int(src[s-2])>>2&0x7
   272  	SHRQ $2, CX
   273  	ANDQ $7, CX
   274  	ADDQ $4, CX
   275  
   276  doCopy:
   277  	// This is the end of the outer "switch", when we have a copy tag.
   278  	//
   279  	// We assume that:
   280  	//	- CX == length && CX > 0
   281  	//	- DX == offset
   282  
   283  	// if offset <= 0 { etc }
   284  	CMPQ DX, $0
   285  	JLE  errCorrupt
   286  
   287  	// if d < offset { etc }
   288  	MOVQ DI, BX
   289  	SUBQ R8, BX
   290  	CMPQ BX, DX
   291  	JLT  errCorrupt
   292  
   293  	// if length > len(dst)-d { etc }
   294  	MOVQ R10, BX
   295  	SUBQ DI, BX
   296  	CMPQ CX, BX
   297  	JGT  errCorrupt
   298  
   299  	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
   300  	//
   301  	// Set:
   302  	//	- R14 = len(dst)-d (the same value that BX holds from the check just above)
   303  	//	- R15 = &dst[d-offset]
   304  	MOVQ R10, R14
   305  	SUBQ DI, R14
   306  	MOVQ DI, R15
   307  	SUBQ DX, R15
   308  
   309  	// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
   310  	//
   311  	// First, try using two 8-byte load/stores, similar to the doLit technique
   312  	// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
   313  	// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
   314  	// and not one 16-byte load/store, and the first store has to be before the
   315  	// second load, due to the overlap if offset is in the range [8, 16).
   316  	//
   317  	// if length > 16 || offset < 8 || len(dst)-d < 16 {
   318  	//   goto slowForwardCopy
   319  	// }
   320  	// copy 16 bytes
   321  	// d += length
   322  	CMPQ CX, $16
   323  	JGT  slowForwardCopy
   324  	CMPQ DX, $8
   325  	JLT  slowForwardCopy
   326  	CMPQ R14, $16
   327  	JLT  slowForwardCopy
   328  	MOVQ 0(R15), AX
   329  	MOVQ AX, 0(DI) // must stay ahead of the load below: with offset in [8, 16) that load reads bytes written here
   330  	MOVQ 8(R15), BX
   331  	MOVQ BX, 8(DI)
   332  	ADDQ CX, DI
   333  	JMP  loop
   334  
   335  slowForwardCopy:
   336  	// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
   337  	// can still try 8-byte load stores, provided we can overrun up to 10 extra
   338  	// bytes. As above, the overrun will be fixed up by subsequent iterations
   339  	// of the outermost loop.
   340  	//
   341  	// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
   342  	// commentary says:
   343  	//
   344  	// ----
   345  	//
   346  	// The main part of this loop is a simple copy of eight bytes at a time
   347  	// until we've copied (at least) the requested amount of bytes.  However,
   348  	// if d and d-offset are less than eight bytes apart (indicating a
   349  	// repeating pattern of length < 8), we first need to expand the pattern in
   350  	// order to get the correct results. For instance, if the buffer looks like
   351  	// this, with the eight-byte <d-offset> and <d> patterns marked as
   352  	// intervals:
   353  	//
   354  	//    abxxxxxxxxxxxx
   355  	//    [------]           d-offset
   356  	//      [------]         d
   357  	//
   358  	// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
   359  	// once, after which we can move <d> two bytes without moving <d-offset>:
   360  	//
   361  	//    ababxxxxxxxxxx
   362  	//    [------]           d-offset
   363  	//        [------]       d
   364  	//
   365  	// and repeat the exercise until the two no longer overlap.
   366  	//
   367  	// This allows us to do very well in the special case of one single byte
   368  	// repeated many times, without taking a big hit for more general cases.
   369  	//
   370  	// The worst case of extra writing past the end of the match occurs when
   371  	// offset == 1 and length == 1; the last copy will read from byte positions
   372  	// [0..7] and write to [4..11], whereas it was only supposed to write to
   373  	// position 1. Thus, ten excess bytes.
   374  	//
   375  	// ----
   376  	//
   377  	// That "10 byte overrun" worst case is confirmed by Go's
   378  	// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
   379  	// and finishSlowForwardCopy algorithm.
   380  	//
   381  	// if length > len(dst)-d-10 {
   382  	//   goto verySlowForwardCopy
   383  	// }
   384  	SUBQ $10, R14 // R14 = len(dst)-d-10; it is only read by the compare below before doCopy recomputes it
   385  	CMPQ CX, R14
   386  	JGT  verySlowForwardCopy
   387  
   388  makeOffsetAtLeast8:
   389  	// !!! As above, expand the pattern so that offset >= 8 and we can use
   390  	// 8-byte load/stores.
   391  	//
   392  	// for offset < 8 {
   393  	//   copy 8 bytes from dst[d-offset:] to dst[d:]
   394  	//   length -= offset
   395  	//   d      += offset
   396  	//   offset += offset
   397  	//   // The two previous lines together means that d-offset, and therefore
   398  	//   // R15, is unchanged.
   399  	// }
   400  	CMPQ DX, $8
   401  	JGE  fixUpSlowForwardCopy
   402  	MOVQ (R15), BX
   403  	MOVQ BX, (DI)
   404  	SUBQ DX, CX
   405  	ADDQ DX, DI
   406  	ADDQ DX, DX
   407  	JMP  makeOffsetAtLeast8
   408  
   409  fixUpSlowForwardCopy:
   410  	// !!! Add length (which might be negative now) to d (implied by DI being
   411  	// &dst[d]) so that d ends up at the right place when we jump back to the
   412  	// top of the loop. Before we do that, though, we save DI to AX so that, if
   413  	// length is positive, copying the remaining length bytes will write to the
   414  	// right place.
   415  	MOVQ DI, AX
   416  	ADDQ CX, DI
   417  
   418  finishSlowForwardCopy:
   419  	// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
   420  	// length means that we overrun, but as above, that will be fixed up by
   421  	// subsequent iterations of the outermost loop. DI already holds its final value; only AX and R15 advance here.
   422  	CMPQ CX, $0
   423  	JLE  loop
   424  	MOVQ (R15), BX
   425  	MOVQ BX, (AX)
   426  	ADDQ $8, R15
   427  	ADDQ $8, AX
   428  	SUBQ $8, CX
   429  	JMP  finishSlowForwardCopy
   430  
   431  verySlowForwardCopy:
   432  	// verySlowForwardCopy is a simple implementation of forward copy. In C
   433  	// parlance, this is a do/while loop instead of a while loop, since we know
   434  	// that length > 0. In Go syntax:
   435  	//
   436  	// for {
   437  	//   dst[d] = dst[d - offset]
   438  	//   d++
   439  	//   length--
   440  	//   if length == 0 {
   441  	//     break
   442  	//   }
   443  	// }
   444  	MOVB (R15), BX
   445  	MOVB BX, (DI)
   446  	INCQ R15
   447  	INCQ DI
   448  	DECQ CX
   449  	JNZ  verySlowForwardCopy // keeps looping while the DECQ above leaves length != 0
   450  	JMP  loop
   451  
   452  // The code above handles copy tags.
   453  // ----------------------------------------
   454  
   455  end:
   456  	// This is the end of the "for s < len(src)".
   457  	//
   458  	// if d != len(dst) { etc }
   459  	CMPQ DI, R10
   460  	JNE  errCorrupt
   461  
   462  	// return 0
   463  	MOVQ $0, ret+48(FP)
   464  	RET
   465  
   466  errCorrupt:
   467  	// return decodeErrCodeCorrupt
   468  	MOVQ $1, ret+48(FP)
   469  	RET
   470  
   471  errUC4T:
   472  	// return decodeErrCodeUnsupportedCopy4Tag
   473  	MOVQ $3, ret+48(FP)
   474  	RET