github.com/bir3/gocompiler@v0.9.2202/extra/compress/zstd/internal/xxhash/xxhash_amd64.s (about)

     1  //go:build !appengine && gc && !purego && !noasm
     2  // +build !appengine
     3  // +build gc
     4  // +build !purego
     5  // +build !noasm
     6  
     7  #include "textflag.h"
     8  
     9  // Registers (symbolic names shared by Sum64 and writeBlocks):
    10  #define h      AX // running hash value (used by Sum64)
    11  #define d      AX // pointer to the Digest (used by writeBlocks); same register as h, each function uses only one of the two
    12  #define p      SI // pointer to advance through b
    13  #define n      DX // len(b)
    14  #define end    BX // loop end
    15  #define v1     R8 // v1..v4: the four accumulators updated by blockLoop
    16  #define v2     R9
    17  #define v3     R10
    18  #define v4     R11
    19  #define x      R12 // scratch: current input word or temporary
    20  #define prime1 R13 // loaded from ·primes+0(SB)
    21  #define prime2 R14 // loaded from ·primes+8(SB)
    22  #define prime4 DI // loaded from ·primes+24(SB); only Sum64 loads it
    23  // round mixes input word x into acc: acc = rol(acc + x*prime2, 31) * prime1. It overwrites x and needs prime1 and prime2 loaded.
    24  #define round(acc, x) \
    25  	IMULQ prime2, x   \
    26  	ADDQ  x, acc      \
    27  	ROLQ  $31, acc    \
    28  	IMULQ prime1, acc
    29  
    30  // round0 performs the operation x = round(0, x), i.e. x = rol(x*prime2, 31) * prime1, updating x in place.
    31  #define round0(x) \
    32  	IMULQ prime2, x \
    33  	ROLQ  $31, x    \
    34  	IMULQ prime1, x
    35  
    36  // mergeRound applies a merge round on the two registers acc and x:
    37  // acc = (acc ^ round0(x)) * prime1 + prime4. It overwrites x and assumes that prime1, prime2, and prime4 have been loaded.
    38  #define mergeRound(acc, x) \
    39  	round0(x)         \
    40  	XORQ  x, acc      \
    41  	IMULQ prime1, acc \
    42  	ADDQ  prime4, acc
    43  
    44  // blockLoop processes as many 32-byte blocks as possible, feeding the four 8-byte words of each block
    45  // through round into v1, v2, v3, and v4 and advancing p by 32. It repeats while p <= end (signed compare), so end must hold the
    46  // last address at which a whole block starts. The test comes after the body, so it assumes at least one block. Overwrites x; defines label loop.
    47  #define blockLoop() \
    48  loop:  \
    49  	MOVQ +0(p), x  \
    50  	round(v1, x)   \
    51  	MOVQ +8(p), x  \
    52  	round(v2, x)   \
    53  	MOVQ +16(p), x \
    54  	round(v3, x)   \
    55  	MOVQ +24(p), x \
    56  	round(v4, x)   \
    57  	ADDQ $32, p    \
    58  	CMPQ p, end    \
    59  	JLE  loop
    60  
    61  // func Sum64(b []byte) uint64: hashes b in one pass (32-byte blocks, then 8-, 4- and 1-byte tails, then a final mix).
    62  TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32 // no stack frame; 32 bytes of arguments and result (b at 0..23, ret at 24)
    63  	// Load fixed primes into registers; the primes entries at offsets 16 and 32 are read from memory where needed.
    64  	MOVQ ·primes+0(SB), prime1
    65  	MOVQ ·primes+8(SB), prime2
    66  	MOVQ ·primes+24(SB), prime4
    67  
    68  	// Load slice: p = &b[0], n = len(b), end = p + n.
    69  	MOVQ b_base+0(FP), p
    70  	MOVQ b_len+8(FP), n
    71  	LEAQ (p)(n*1), end
    72  
    73  	// The first loop limit will be len(b)-32: the last address at which a whole 32-byte block starts.
    74  	SUBQ $32, end
    75  
    76  	// Check whether we have at least one block.
    77  	CMPQ n, $32
    78  	JLT  noBlocks // n < 32: skip the block loop and seed h at noBlocks instead
    79  
    80  	// Set up initial state: v1 = prime1 + prime2, v2 = prime2, v3 = 0, v4 = -prime1.
    81  	MOVQ prime1, v1
    82  	ADDQ prime2, v1
    83  	MOVQ prime2, v2
    84  	XORQ v3, v3
    85  	XORQ v4, v4
    86  	SUBQ prime1, v4
    87  
    88  	blockLoop()
    89  
    90  	MOVQ v1, h // h = rol(v1, 1) + rol(v2, 7) + rol(v3, 12) + rol(v4, 18), built up via x
    91  	ROLQ $1, h
    92  	MOVQ v2, x
    93  	ROLQ $7, x
    94  	ADDQ x, h
    95  	MOVQ v3, x
    96  	ROLQ $12, x
    97  	ADDQ x, h
    98  	MOVQ v4, x
    99  	ROLQ $18, x
   100  	ADDQ x, h
   101  	// Merge the four accumulators into h; each step is h = (h ^ round0(vN)) * prime1 + prime4 and overwrites vN.
   102  	mergeRound(h, v1)
   103  	mergeRound(h, v2)
   104  	mergeRound(h, v3)
   105  	mergeRound(h, v4)
   106  
   107  	JMP afterBlocks
   108  
   109  noBlocks:
   110  	MOVQ ·primes+32(SB), h // fewer than 32 bytes: h starts as the primes entry at offset 32
   111  
   112  afterBlocks:
   113  	ADDQ n, h // mix in the total input length
   114  
   115  	ADDQ $24, end // end = &b[0] + len(b) - 8: last address at which an 8-byte word still fits
   116  	CMPQ p, end
   117  	JG   try4 // fewer than 8 bytes remain
   118  
   119  loop8:
   120  	MOVQ  (p), x
   121  	ADDQ  $8, p
   122  	round0(x)
   123  	XORQ  x, h // h = rol(h ^ round0(x), 27) * prime1 + prime4
   124  	ROLQ  $27, h
   125  	IMULQ prime1, h
   126  	ADDQ  prime4, h
   127  
   128  	CMPQ p, end
   129  	JLE  loop8
   130  
   131  try4:
   132  	ADDQ $4, end // end = &b[0] + len(b) - 4: last address at which a 4-byte word still fits
   133  	CMPQ p, end
   134  	JG   try1 // fewer than 4 bytes remain
   135  
   136  	MOVL  (p), x // 32-bit load; the upper half of x is zeroed
   137  	ADDQ  $4, p
   138  	IMULQ prime1, x
   139  	XORQ  x, h // h = rol(h ^ (x * prime1), 23) * prime2 + (primes entry at offset 16)
   140  
   141  	ROLQ  $23, h
   142  	IMULQ prime2, h
   143  	ADDQ  ·primes+16(SB), h
   144  
   145  try1:
   146  	ADDQ $4, end // end = &b[0] + len(b): one past the last byte
   147  	CMPQ p, end
   148  	JGE  finalize // no bytes remain
   149  
   150  loop1:
   151  	MOVBQZX (p), x // one byte, zero-extended
   152  	ADDQ    $1, p
   153  	IMULQ   ·primes+32(SB), x
   154  	XORQ    x, h // h = rol(h ^ (byte * primes entry at offset 32), 11) * prime1
   155  	ROLQ    $11, h
   156  	IMULQ   prime1, h
   157  
   158  	CMPQ p, end
   159  	JL   loop1
   160  
   161  finalize:
   162  	MOVQ  h, x // h ^= h >> 33
   163  	SHRQ  $33, x
   164  	XORQ  x, h
   165  	IMULQ prime2, h // h *= prime2
   166  	MOVQ  h, x // h ^= h >> 29
   167  	SHRQ  $29, x
   168  	XORQ  x, h
   169  	IMULQ ·primes+16(SB), h // h *= primes entry at offset 16
   170  	MOVQ  h, x // h ^= h >> 32
   171  	SHRQ  $32, x
   172  	XORQ  x, h
   173  
   174  	MOVQ h, ret+24(FP)
   175  	RET
   176  
   177  // func writeBlocks(d *Digest, b []byte) int: runs blockLoop over b starting from the v1..v4 stored at d, stores them back, and returns how far p advanced.
   178  TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40 // no stack frame; 40 bytes of arguments and result (pointer at 0, b at 8..31, ret at 32)
   179  	// Load fixed primes needed for round (prime4 is not used by this function).
   180  	MOVQ ·primes+0(SB), prime1
   181  	MOVQ ·primes+8(SB), prime2
   182  
   183  	// Load slice: p = &b[0], n = len(b), end = p + n - 32 (last address at which a whole block starts).
   184  	MOVQ b_base+8(FP), p
   185  	MOVQ b_len+16(FP), n
   186  	LEAQ (p)(n*1), end
   187  	SUBQ $32, end
   188  
   189  	// Load vN from d: the four 8-byte words at offsets 0, 8, 16 and 24.
   190  	MOVQ s+0(FP), d // NOTE(review): the frame name here is s while the signature comment says d (d is #defined to AX in this file); confirm against the Go declaration
   191  	MOVQ 0(d), v1
   192  	MOVQ 8(d), v2
   193  	MOVQ 16(d), v3
   194  	MOVQ 24(d), v4
   195  
   196  	// We don't need to check the loop condition here; this function is
   197  	// always called with at least one block of data to process.
   198  	blockLoop()
   199  
   200  	// Copy vN back to d.
   201  	MOVQ v1, 0(d)
   202  	MOVQ v2, 8(d)
   203  	MOVQ v3, 16(d)
   204  	MOVQ v4, 24(d)
   205  
   206  	// The number of bytes written is p minus the old base pointer; p only advances in 32-byte steps, so it is a multiple of 32.
   207  	SUBQ b_base+8(FP), p
   208  	MOVQ p, ret+32(FP)
   209  
   210  	RET
   209  
   210  	RET