// Origin: github.com/parquet-go/parquet-go@v0.20.0/bloom/xxhash/xxhash_amd64.s

//go:build !purego

#include "textflag.h"

// This file holds the amd64 assembly implementation of Sum64.
// NOTE(review): judging by the file path and the constant names this is
// presumably the 64-bit xxHash (XXH64) with a zero seed -- confirm against the
// pure-Go fallback that is built when the purego tag is set.

// The five 64-bit multiplicative constants used by the hash.
#define PRIME1 0x9E3779B185EBCA87
#define PRIME2 0xC2B2AE3D27D4EB4F
#define PRIME3 0x165667B19E3779F9
#define PRIME4 0x85EBCA77C2B2AE63
#define PRIME5 0x27D4EB2F165667C5

// PRIME3 and PRIME5 are additionally stored as file-local, read-only 8-byte
// globals. Sum64 uses them as memory operands of ADDQ/IMULQ, so they do not
// need a dedicated register (PRIME1, PRIME2 and PRIME4 are kept in registers).
DATA prime3<>+0(SB)/8, $PRIME3
GLOBL prime3<>(SB), RODATA|NOPTR, $8

DATA prime5<>+0(SB)/8, $PRIME5
GLOBL prime5<>(SB), RODATA|NOPTR, $8

// Register allocation:
// AX	h      (running hash / result)
// SI	pointer to advance through b
// DX	n      (len(b))
// BX	loop end (limit pointer; only ever compared, never dereferenced)
// R8	v1, k1
// R9	v2
// R10	v3
// R11	v4
// R12	tmp
// R13	PRIME1
// R14	PRIME2
// DI	PRIME4

// round reads from and advances the buffer pointer in SI.
// It assumes that R13 has PRIME1 and R14 has PRIME2.
//
// Effect, with all arithmetic modulo 2^64:
//	r  = rotl(r + load64(SI)*PRIME2, 31) * PRIME1
//	SI += 8
// R12 is clobbered as scratch.
#define round(r) \
	MOVQ  (SI), R12 \
	ADDQ  $8, SI    \
	IMULQ R14, R12  \
	ADDQ  R12, r    \
	ROLQ  $31, r    \
	IMULQ R13, r

// mergeRound applies a merge round on the two registers acc and val.
// It assumes that R13 has PRIME1, R14 has PRIME2, and DI has PRIME4.
//
// Effect, with all arithmetic modulo 2^64:
//	val = rotl(val*PRIME2, 31) * PRIME1
//	acc = (acc ^ val)*PRIME1 + PRIME4
// val is overwritten in the process.
#define mergeRound(acc, val) \
	IMULQ R14, val \
	ROLQ  $31, val \
	IMULQ R13, val \
	XORQ  val, acc \
	IMULQ R13, acc \
	ADDQ  DI, acc

// func Sum64(b []byte) uint64
//
// Sum64 hashes the bytes of b and returns a 64-bit digest.
//
// Frame layout ($0-32): no local stack frame; 32 bytes of arguments/results.
// The slice pointer is read from b_base+0(FP), the length from b_len+8(FP),
// and the result is stored to ret+24(FP). The slice capacity is not read.
//
// Processing stages, in order:
//	1. blockLoop   - 32 bytes per iteration into four accumulators (v1..v4),
//	                 only entered when len(b) >= 32
//	2. wordLoop    - remaining input, 8 bytes at a time
//	3. fourByte    - one 4-byte chunk, if at least 4 bytes remain
//	4. singlesLoop - remaining input, 1 byte at a time
//	5. finalize    - final shift/xor/multiply mixing of h
TEXT ·Sum64(SB), NOSPLIT, $0-32
	// Load fixed primes.
	MOVQ $PRIME1, R13
	MOVQ $PRIME2, R14
	MOVQ $PRIME4, DI

	// Load slice. BX = one past the last byte of b.
	MOVQ b_base+0(FP), SI
	MOVQ b_len+8(FP), DX
	LEAQ (SI)(DX*1), BX

	// The first loop limit will be len(b)-32, i.e. BX = end-32.
	// For len(b) < 32 this lies before the start of b; that is fine because
	// BX is only compared against SI (with signed jumps), never dereferenced.
	SUBQ $32, BX

	// Check whether we have at least one block.
	CMPQ DX, $32
	JLT  noBlocks

	// Set up initial state (v1, v2, v3, v4):
	//	v1 = PRIME1 + PRIME2
	//	v2 = PRIME2
	//	v3 = 0
	//	v4 = 0 - PRIME1
	MOVQ R13, R8
	ADDQ R14, R8
	MOVQ R14, R9
	XORQ R10, R10
	XORQ R11, R11
	SUBQ R13, R11

	// Loop until SI > BX, i.e. while at least 32 unread bytes remain.
	// Each iteration consumes 4 x 8 bytes, one 8-byte word per accumulator.
blockLoop:
	round(R8)
	round(R9)
	round(R10)
	round(R11)

	CMPQ SI, BX
	JLE  blockLoop

	// Combine the accumulators:
	//	h = rotl(v1, 1) + rotl(v2, 7) + rotl(v3, 12) + rotl(v4, 18)
	// R12 holds the rotated copy so v2..v4 stay intact for the merge rounds.
	MOVQ R8, AX
	ROLQ $1, AX
	MOVQ R9, R12
	ROLQ $7, R12
	ADDQ R12, AX
	MOVQ R10, R12
	ROLQ $12, R12
	ADDQ R12, AX
	MOVQ R11, R12
	ROLQ $18, R12
	ADDQ R12, AX

	// Fold each accumulator into h (this overwrites v1..v4).
	mergeRound(AX, R8)
	mergeRound(AX, R9)
	mergeRound(AX, R10)
	mergeRound(AX, R11)

	JMP afterBlocks

noBlocks:
	// Fewer than 32 bytes of input: h starts from PRIME5 instead.
	MOVQ $PRIME5, AX

afterBlocks:
	// Mix in the total input length: h += len(b).
	ADDQ DX, AX

	// Right now BX has end-32 (len(b)-32 past the base), and we want to loop
	// until SI > end-8, i.e. while at least 8 unread bytes remain.
	ADDQ $24, BX

	CMPQ SI, BX
	JG   fourByte

wordLoop:
	// Calculate k1 = rotl(load64(SI)*PRIME2, 31) * PRIME1, advancing SI by 8.
	MOVQ  (SI), R8
	ADDQ  $8, SI
	IMULQ R14, R8
	ROLQ  $31, R8
	IMULQ R13, R8

	// h = rotl(h ^ k1, 27)*PRIME1 + PRIME4
	XORQ  R8, AX
	ROLQ  $27, AX
	IMULQ R13, AX
	ADDQ  DI, AX

	CMPQ SI, BX
	JLE  wordLoop

fourByte:
	// BX = end-4. Skip this stage when fewer than 4 unread bytes remain.
	ADDQ $4, BX
	CMPQ SI, BX
	JG   singles

	// Load one 32-bit word into R8 and advance SI by 4:
	//	h = rotl(h ^ (load32(SI)*PRIME1), 23)*PRIME2 + PRIME3
	MOVL  (SI), R8
	ADDQ  $4, SI
	IMULQ R13, R8
	XORQ  R8, AX

	ROLQ  $23, AX
	IMULQ R14, AX
	ADDQ  prime3<>(SB), AX

singles:
	// BX = end. Nothing left to do when SI has already reached it.
	ADDQ $4, BX
	CMPQ SI, BX
	JGE  finalize

singlesLoop:
	// Per remaining byte (zero-extended into R12):
	//	h = rotl(h ^ (byte*PRIME5), 11) * PRIME1
	MOVBQZX (SI), R12
	ADDQ    $1, SI
	IMULQ   prime5<>(SB), R12
	XORQ    R12, AX

	ROLQ  $11, AX
	IMULQ R13, AX

	CMPQ SI, BX
	JL   singlesLoop

finalize:
	// Final mixing of h:
	//	h ^= h >> 33
	//	h *= PRIME2
	//	h ^= h >> 29
	//	h *= PRIME3
	//	h ^= h >> 32
	MOVQ  AX, R12
	SHRQ  $33, R12
	XORQ  R12, AX
	IMULQ R14, AX
	MOVQ  AX, R12
	SHRQ  $29, R12
	XORQ  R12, AX
	IMULQ prime3<>(SB), AX
	MOVQ  AX, R12
	SHRQ  $32, R12
	XORQ  R12, AX

	// Store the result.
	MOVQ AX, ret+24(FP)
	RET