// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build gc

#define NOSPLIT 4
#define RODATA 8

// func castagnoliSSE42(crc uint32, p []byte) uint32
//
// Updates crc with the bytes of p using the CRC32B/CRC32Q instructions.
// The incoming crc is bit-inverted before processing and the result is
// bit-inverted again before it is returned.
//
// Register use:
//   AX - running CRC
//   SI - read cursor into p
//   CX - number of bytes still to consume
TEXT ·castagnoliSSE42(SB), NOSPLIT, $0
	MOVL crc+0(FP), AX
	MOVQ p+8(FP), SI
	MOVQ p_len+16(FP), CX

	NOTL AX

	// Inputs shorter than 8 bytes skip straight to the byte loop.
	CMPQ CX, $8
	JL   tail

align8:
	// Consume single bytes until the cursor sits on an 8-byte boundary.
	TESTQ $7, SI
	JZ    chunk8

	CRC32B (SI), AX
	INCQ   SI
	DECQ   CX
	JMP    align8

chunk8:
	// Cursor is 8-byte aligned: consume one quadword per iteration
	// for as long as at least 8 bytes remain.
	CMPQ CX, $8
	JL   tail

	CRC32Q (SI), AX
	SUBQ   $8, CX
	ADDQ   $8, SI
	JMP    chunk8

tail:
	// Whatever is left is handled one byte at a time.
	TESTQ CX, CX
	JZ    done

	CRC32B (SI), AX
	DECQ   CX
	INCQ   SI
	JMP    tail

done:
	NOTL AX
	MOVL AX, ret+32(FP)
	RET

// func haveSSE42() bool
//
// Runs CPUID with AX=1 and returns bit 20 of the resulting CX.
TEXT ·haveSSE42(SB), NOSPLIT, $0
	MOVL $1, AX
	CPUID
	SHRL $20, CX
	ANDL $1, CX
	MOVB CX, ret+0(FP)
	RET

// func haveCLMUL() bool
//
// Runs CPUID with AX=1 and returns bit 1 of the resulting CX.
TEXT ·haveCLMUL(SB), NOSPLIT, $0
	MOVL $1, AX
	CPUID
	SHRL $1, CX
	ANDL $1, CX
	MOVB CX, ret+0(FP)
	RET

// func haveSSE41() bool
//
// Runs CPUID with AX=1 and returns bit 19 of the resulting CX.
TEXT ·haveSSE41(SB), NOSPLIT, $0
	MOVL $1, AX
	CPUID
	SHRL $19, CX
	ANDL $1, CX
	MOVB CX, ret+0(FP)
	RET

// CRC32 polynomial data
//
// These constants are lifted from the
// Linux kernel, since they avoid the costly
// PSHUFB 16 byte reversal proposed in the
// original Intel paper.
// Folding constants consumed by ieeeCLMUL:
//   r2r1kp   - multiplier pair used while folding 64 bytes per iteration
//   r4r3kp   - multiplier pair used while folding 16 bytes at a time and
//              for the first step of the final reduction
//   r5kp     - multiplier used in the second step of the final reduction
//   rupolykp - multiplier pair used by the last two carry-less multiplies
DATA r2r1kp<>+0(SB)/8, $0x154442bd4
DATA r2r1kp<>+8(SB)/8, $0x1c6e41596
DATA r4r3kp<>+0(SB)/8, $0x1751997d0
DATA r4r3kp<>+8(SB)/8, $0x0ccaa009e
DATA rupolykp<>+0(SB)/8, $0x1db710641
DATA rupolykp<>+8(SB)/8, $0x1f7011641
DATA r5kp<>+0(SB)/8, $0x163cd6124

GLOBL r2r1kp<>(SB), RODATA, $16
GLOBL r4r3kp<>(SB), RODATA, $16
GLOBL rupolykp<>(SB), RODATA, $16
GLOBL r5kp<>(SB), RODATA, $8

// Based on http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
// len(p) must be at least 64, and must be a multiple of 16.

// func ieeeCLMUL(crc uint32, p []byte) uint32
//
// Folds p into crc using carry-less multiplication (PCLMULQDQ) and
// returns the resulting 32-bit value. Unlike castagnoliSSE42, this
// routine does not invert crc on entry or exit.
//
// The first 64 bytes of p are loaded unconditionally, so the length
// precondition above is not checked here and must be guaranteed by
// the caller.
//
// Register use:
//   SI      - read cursor into p
//   CX      - number of bytes still to consume
//   X0      - current folding constants
//   X1..X4  - four 16-byte accumulators (collapsed into X1 at remain64)
//   X5..X8  - scratch copies for the high-half multiplies
//   X10..X14 - freshly loaded input
TEXT ·ieeeCLMUL(SB), NOSPLIT, $0
	MOVL crc+0(FP), X0     // Initial CRC value
	MOVQ p+8(FP), SI       // data pointer
	MOVQ p_len+16(FP), CX  // len(p)

	// Prime the four accumulators with the first 64 bytes and mix the
	// initial CRC into the first one.
	MOVOU (SI), X1
	MOVOU 16(SI), X2
	MOVOU 32(SI), X3
	MOVOU 48(SI), X4
	PXOR  X0, X1
	ADDQ  $64, SI // buf+=64
	SUBQ  $64, CX // len-=64
	CMPQ  CX, $64 // Less than 64 bytes left
	JB    remain64

	MOVOA r2r1kp<>+0(SB), X0

loopback64:
	// Keep a copy of each accumulator: the low and high quadwords are
	// multiplied separately and then combined.
	MOVOA X1, X5
	MOVOA X2, X6
	MOVOA X3, X7
	MOVOA X4, X8

	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0, X0, X2
	PCLMULQDQ $0, X0, X3
	PCLMULQDQ $0, X0, X4

	// Load next early
	MOVOU (SI), X11
	MOVOU 16(SI), X12
	MOVOU 32(SI), X13
	MOVOU 48(SI), X14

	PCLMULQDQ $0x11, X0, X5
	PCLMULQDQ $0x11, X0, X6
	PCLMULQDQ $0x11, X0, X7
	PCLMULQDQ $0x11, X0, X8

	PXOR X5, X1
	PXOR X6, X2
	PXOR X7, X3
	PXOR X8, X4

	PXOR X11, X1
	PXOR X12, X2
	PXOR X13, X3
	PXOR X14, X4

	ADDQ $64, SI // buf+=64
	SUBQ $64, CX // len-=64
	CMPQ CX, $64 // Less than 64 bytes left?
	JGE  loopback64

	// Fold result into a single register (X1)
remain64:
	MOVOA r4r3kp<>+0(SB), X0

	// X1 = fold(X1) ^ X2
	MOVOA     X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR      X5, X1
	PXOR      X2, X1

	// X1 = fold(X1) ^ X3
	MOVOA     X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR      X5, X1
	PXOR      X3, X1

	// X1 = fold(X1) ^ X4
	MOVOA     X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR      X5, X1
	PXOR      X4, X1

	// At least 16 bytes left?
	CMPQ CX, $16
	JB   finish

	// Encode 16 bytes
remain16:
	MOVOU     (SI), X10
	MOVOA     X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR      X5, X1
	PXOR      X10, X1
	SUBQ      $16, CX
	ADDQ      $16, SI
	CMPQ      CX, $16
	JGE       remain16

finish:
	// Fold final result into 32 bits and return it.
	// X0 still holds r4r3kp on every path that reaches this label.
	PCMPEQB   X3, X3 // X3 = all ones; turned into a mask below
	PCLMULQDQ $1, X1, X0
	PSRLDQ    $8, X1
	PXOR      X0, X1

	MOVOA X1, X2
	MOVQ  r5kp<>+0(SB), X0

	// Creates 32 bit mask. Note that we don't care about upper half.
	PSRLQ $32, X3

	PSRLDQ    $4, X2
	PAND      X3, X1
	PCLMULQDQ $0, X0, X1
	PXOR      X2, X1

	MOVOA rupolykp<>+0(SB), X0

	MOVOA     X1, X2
	PAND      X3, X1
	PCLMULQDQ $0x10, X0, X1
	PAND      X3, X1
	PCLMULQDQ $0, X0, X1
	PXOR      X2, X1

	// PEXTRD $1, X1, AX (SSE 4.1)
	// Hand-encoded; extracts the second 32-bit lane of X1 into AX.
	BYTE $0x66; BYTE $0x0f; BYTE $0x3a
	BYTE $0x16; BYTE $0xc8; BYTE $0x01
	MOVL AX, ret+32(FP)

	RET