//go:build !appengine && gc && !purego && !noasm
// +build !appengine
// +build gc
// +build !purego
// +build !noasm

// Origin: github.com/bir3/gocompiler@v0.9.2202/extra/compress/zstd/internal/xxhash/xxhash_arm64.s

#include "textflag.h"

// Register aliases shared by both routines in this file.
#define digest	R1  // pointer to the Digest state (writeBlocks only)
#define h	R2  // running hash; becomes Sum64's return value
#define p	R3  // input cursor, advanced by the post-increment loads
#define n	R4  // input length in bytes (never decremented)
#define nblocks	R5  // countdown of 32-byte blocks, initialised to n >> 5
#define prime1	R7
#define prime2	R8
#define prime3	R9
#define prime4	R10
#define prime5	R11
#define v1	R12 // the four lane accumulators
#define v2	R13
#define v3	R14
#define v4	R15
#define x1	R20 // scratch registers for freshly loaded input
#define x2	R21
#define x3	R22
#define x4	R23

// round folds the input word x into the accumulator acc:
//
//	acc = rotl(acc + x*prime2, 31) * prime1
//
// x is left unmodified.
#define round(acc, x) \
	MADD prime2, acc, x, acc \
	ROR  $64-31, acc         \
	MUL  prime1, acc

// round0 is round with a zero accumulator, computed in place:
//
//	x = rotl(x*prime2, 31) * prime1
#define round0(x) \
	MUL prime2, x \
	ROR $64-31, x \
	MUL prime1, x

// mergeRound folds the lane x into acc. x is overwritten with round0(x):
//
//	acc = (acc ^ round0(x))*prime1 + prime4
#define mergeRound(acc, x) \
	round0(x)                     \
	EOR  x, acc                   \
	MADD acc, prime4, prime1, acc

// blockLoop consumes n/32 blocks of 32 bytes starting at p, feeding one
// 8-byte word into each of v1..v4 per block. p is left pointing just past
// the last full block; nblocks, x1..x4 are clobbered; n is not modified.
//
// The loop body runs before the counter is tested, so the caller must
// guarantee n >= 32.
#define blockLoop() \
	LSR     $5, n, nblocks  \
	PCALIGN $16             \
	nextBlock:              \
	LDP.P   16(p), (x1, x2) \
	LDP.P   16(p), (x3, x4) \
	round(v1, x1)           \
	round(v2, x2)           \
	round(v3, x3)           \
	round(v4, x4)           \
	SUB     $1, nblocks     \
	CBNZ    nblocks, nextBlock

// tailMix8 mixes one 8-byte tail word x into h. x is overwritten:
//
//	h = (rotl(h, 27) ^ rotl(round0(x), 27))*prime1 + prime4
//
// Rotating h first and then XORing with a rotated-register operand, rather
// than XORing first and rotating the result, is worth a small but measurable
// speedup for small inputs.
#define tailMix8(x) \
	round0(x)                 \
	ROR  $64-27, h            \
	EOR  x @> 64-27, h, h     \
	MADD h, prime4, prime1, h

// tailMix1 mixes one tail byte (already zero-extended in x) into h.
// x is overwritten:
//
//	h = (rotl(h, 11) ^ rotl(x*prime5, 11)) * prime1
#define tailMix1(x) \
	MUL prime5, x         \
	ROR $64-11, h         \
	EOR x @> 64-11, h, h  \
	MUL prime1, h

// func Sum64(b []byte) uint64
//
// Hashes b in one shot. The five primes are read from ·primes, which is
// declared outside this file and is accessed here as five consecutive
// 64-bit words. The tail is dispatched on bits 4..0 of n, so each of the
// 16-, 8-, 4-, 2- and 1-byte steps executes at most once.
TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32
	LDP b_base+0(FP), (p, n)

	LDP  ·primes+0(SB), (prime1, prime2)
	LDP  ·primes+16(SB), (prime3, prime4)
	MOVD ·primes+32(SB), prime5

	// n < 32: no full block exists, so seed h with prime5 and go straight
	// to the tail. Otherwise h is zeroed here and recomputed from the lanes.
	CMP  $32, n
	CSEL LT, prime5, ZR, h
	BLT  tail

	// Lane seeds: v1 = prime1+prime2, v2 = prime2, v3 = 0, v4 = -prime1.
	ADD  prime1, prime2, v1
	MOVD prime2, v2
	MOVD $0, v3
	NEG  prime1, v4

	blockLoop()

	// h = rotl(v1, 1) + rotl(v2, 7) + rotl(v3, 12) + rotl(v4, 18)
	ROR $64-1, v1, x1
	ROR $64-7, v2, x2
	ADD x1, x2
	ROR $64-12, v3, x3
	ROR $64-18, v4, x4
	ADD x3, x4
	ADD x2, x4, h

	mergeRound(h, v1)
	mergeRound(h, v2)
	mergeRound(h, v3)
	mergeRound(h, v4)

tail:
	// Fold the total length into the hash.
	ADD n, h

	// Bit 4 of n set: two more 8-byte words remain.
	TBZ   $4, n, tail8
	LDP.P 16(p), (x1, x2)

	tailMix8(x1)
	tailMix8(x2)

tail8:
	// Bit 3 of n set: one more 8-byte word.
	TBZ    $3, n, tail4
	MOVD.P 8(p), x1

	tailMix8(x1)

tail4:
	// Bit 2 of n set: one zero-extended 4-byte word.
	//	h = (rotl(h, 23) ^ rotl(x*prime1, 23))*prime2 + prime3
	TBZ     $2, n, tail2
	MOVWU.P 4(p), x2

	MUL  prime1, x2
	ROR  $64-23, h
	EOR  x2 @> 64-23, h, h
	MADD h, prime3, prime2, h

tail2:
	// Bit 1 of n set: load a halfword and mix its low byte, then its
	// high byte, as two separate single-byte steps.
	TBZ     $1, n, tail1
	MOVHU.P 2(p), x3
	AND     $255, x3, x1
	LSR     $8, x3, x2

	tailMix1(x1)
	tailMix1(x2)

tail1:
	// Bit 0 of n set: one final byte.
	TBZ   $0, n, avalanche
	MOVBU (p), x4

	tailMix1(x4)

avalanche:
	// Final xor-shift / multiply scramble of h.
	EOR h >> 33, h
	MUL prime2, h
	EOR h >> 29, h
	MUL prime3, h
	EOR h >> 32, h

	MOVD h, ret+24(FP)
	RET

// func writeBlocks(d *Digest, b []byte) int
//
// Runs blockLoop over b using the lane state held in d, writes the updated
// lanes back, and returns len(b) with its low five bits cleared, i.e. the
// number of bytes covered by whole 32-byte blocks.
//
// blockLoop's precondition applies: len(b) must be at least 32. No check is
// made here.
TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
	// Only prime1 and prime2 are needed by round.
	LDP ·primes+0(SB), (prime1, prime2)

	// Fetch the lanes. This relies on v1..v4 being the first 32 bytes of
	// the Digest, stored contiguously.
	MOVD d+0(FP), digest
	LDP  0(digest), (v1, v2)
	LDP  16(digest), (v3, v4)

	LDP b_base+8(FP), (p, n)

	blockLoop()

	// Write the lanes back to the Digest.
	STP (v1, v2), 0(digest)
	STP (v3, v4), 16(digest)

	// Return value: n rounded down to a multiple of 32.
	BIC  $31, n
	MOVD n, ret+32(FP)
	RET