// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// castagnoliSSE42 updates the (non-inverted) crc with the given buffer.
//
// The buffer is consumed in three phases: up to 7 leading bytes to bring the
// data pointer to an 8-byte boundary (only when len(p) >= 8), then 8-byte
// chunks, then a tail of 4, 2 and 1 bytes selected by the low bits of the
// remaining length.
//
// Registers: AX = running crc, SI = data pointer, CX = bytes remaining,
// BX = number of bytes needed to reach alignment.
//
// func castagnoliSSE42(crc uint32, p []byte) uint32
TEXT ·castagnoliSSE42(SB),NOSPLIT,$0
	MOVL crc+0(FP), AX  // CRC value
	MOVQ p+8(FP), SI  // data pointer
	MOVQ p_len+16(FP), CX  // len(p)

	// If there are fewer than 8 bytes to process, skip alignment.
	CMPQ CX, $8
	JL less_than_8

	// BX = low three bits of the pointer; zero means already aligned.
	MOVQ SI, BX
	ANDQ $7, BX
	JZ aligned

	// Process the first few bytes to 8-byte align the input.

	// BX = 8 - BX. We need to process this many bytes to align.
	// For BX in 1..7, (BX-1) XOR 7 equals 8-BX.
	SUBQ $1, BX
	XORQ $7, BX

	// Bit 0 of BX set: consume one byte.
	BTQ $0, BX
	JNC align_2

	CRC32B (SI), AX
	DECQ CX
	INCQ SI

align_2:
	// Bit 1 of BX set: consume two bytes.
	BTQ $1, BX
	JNC align_4

	// CRC32W (SI), AX
	BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06

	SUBQ $2, CX
	ADDQ $2, SI

align_4:
	// Bit 2 of BX set: consume four bytes.
	BTQ $2, BX
	JNC aligned

	// CRC32L (SI), AX
	BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06

	SUBQ $4, CX
	ADDQ $4, SI

aligned:
	// The input is now 8-byte aligned and we can process 8-byte chunks.
	CMPQ CX, $8
	JL less_than_8

	CRC32Q (SI), AX
	ADDQ $8, SI
	SUBQ $8, CX
	JMP aligned

less_than_8:
	// We may have some bytes left over; process 4 bytes, then 2, then 1.
	// CX is not updated here; its low three bits select which steps run.
	BTQ $2, CX
	JNC less_than_4

	// CRC32L (SI), AX
	BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
	ADDQ $4, SI

less_than_4:
	BTQ $1, CX
	JNC less_than_2

	// CRC32W (SI), AX
	BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
	ADDQ $2, SI

less_than_2:
	BTQ $0, CX
	JNC done

	CRC32B (SI), AX

done:
	MOVL AX, ret+32(FP)
	RET

// castagnoliSSE42Triple updates three (non-inverted) crcs with (24*rounds)
// bytes from each buffer.
//
// Each round feeds three 8-byte words from a, b and c into crcA, crcB and
// crcC respectively. Only the data pointers of the slices are read; the slice
// lengths are not checked, so the caller must supply at least 24*rounds
// bytes in each buffer. With rounds == 0 the crcs are returned unchanged.
//
// func castagnoliSSE42Triple(
//     crcA, crcB, crcC uint32,
//     a, b, c []byte,
//     rounds uint32,
// ) (retA uint32, retB uint32, retC uint32)
TEXT ·castagnoliSSE42Triple(SB),NOSPLIT,$0
	MOVL crcA+0(FP), AX
	MOVL crcB+4(FP), CX
	MOVL crcC+8(FP), DX

	MOVQ a+16(FP), R8   // data pointer
	MOVQ b+40(FP), R9   // data pointer
	MOVQ c+64(FP), R10  // data pointer

	MOVL rounds+88(FP), R11

	// The loop below tests its counter only after the first iteration, so
	// a zero count would wrap around and read far past the buffers.
	// Return the inputs untouched in that case.
	TESTL R11, R11
	JZ triple_done

loop:
	CRC32Q (R8), AX
	CRC32Q (R9), CX
	CRC32Q (R10), DX

	CRC32Q 8(R8), AX
	CRC32Q 8(R9), CX
	CRC32Q 8(R10), DX

	CRC32Q 16(R8), AX
	CRC32Q 16(R9), CX
	CRC32Q 16(R10), DX

	ADDQ $24, R8
	ADDQ $24, R9
	ADDQ $24, R10

	DECQ R11
	JNZ loop

triple_done:
	MOVL AX, retA+96(FP)
	MOVL CX, retB+100(FP)
	MOVL DX, retC+104(FP)
	RET

// haveSSE42 executes CPUID with AX = 1 and returns bit 20 of CX.
//
// func haveSSE42() bool
TEXT ·haveSSE42(SB),NOSPLIT,$0
	XORQ AX, AX
	INCL AX  // AX = 1
	CPUID
	SHRQ $20, CX
	ANDQ $1, CX
	MOVB CX, ret+0(FP)
	RET

// haveCLMUL executes CPUID with AX = 1 and returns bit 1 of CX.
//
// func haveCLMUL() bool
TEXT ·haveCLMUL(SB),NOSPLIT,$0
	XORQ AX, AX
	INCL AX  // AX = 1
	CPUID
	SHRQ $1, CX
	ANDQ $1, CX
	MOVB CX, ret+0(FP)
	RET

// haveSSE41 executes CPUID with AX = 1 and returns bit 19 of CX.
//
// func haveSSE41() bool
TEXT ·haveSSE41(SB),NOSPLIT,$0
	XORQ AX, AX
	INCL AX  // AX = 1
	CPUID
	SHRQ $19, CX
	ANDQ $1, CX
	MOVB CX, ret+0(FP)
	RET

// CRC32 polynomial data
//
// These constants are lifted from the
// Linux kernel, since they avoid the costly
// PSHUFB 16 byte reversal proposed in the
// original Intel paper.
//
// r2r1 is used by the 64-byte folding loop, r4r3 by the 16-byte folds,
// and r5 and rupoly by the final reduction to 32 bits.
DATA r2r1<>+0(SB)/8, $0x154442bd4
DATA r2r1<>+8(SB)/8, $0x1c6e41596
DATA r4r3<>+0(SB)/8, $0x1751997d0
DATA r4r3<>+8(SB)/8, $0x0ccaa009e
DATA rupoly<>+0(SB)/8, $0x1db710641
DATA rupoly<>+8(SB)/8, $0x1f7011641
DATA r5<>+0(SB)/8, $0x163cd6124

GLOBL r2r1<>(SB),RODATA,$16
GLOBL r4r3<>(SB),RODATA,$16
GLOBL rupoly<>(SB),RODATA,$16
GLOBL r5<>(SB),RODATA,$8

// Based on http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
// len(p) must be at least 64, and must be a multiple of 16.
//
// ieeeCLMUL keeps four 16-byte accumulators in X1..X4. While at least 64
// more bytes remain, each accumulator is multiplied by the r2r1 constants
// with PCLMULQDQ and XORed with the next 64 bytes of input. The four
// accumulators are then folded into X1 using r4r3, any remaining 16-byte
// blocks are folded in the same way, and the 128-bit result is reduced to
// 32 bits using r5 and rupoly.
//
// Registers: SI = data pointer, CX = bytes remaining, X0 = current folding
// constants (initially the crc argument).
//
// func ieeeCLMUL(crc uint32, p []byte) uint32
TEXT ·ieeeCLMUL(SB),NOSPLIT,$0
	MOVL crc+0(FP), X0  // Initial CRC value
	MOVQ p+8(FP), SI  // data pointer
	MOVQ p_len+16(FP), CX  // len(p)

	// Load the first 64 bytes and mix the incoming crc into the first block.
	MOVOU (SI), X1
	MOVOU 16(SI), X2
	MOVOU 32(SI), X3
	MOVOU 48(SI), X4
	PXOR X0, X1
	ADDQ $64, SI  // buf+=64
	SUBQ $64, CX  // len-=64
	CMPQ CX, $64  // Less than 64 bytes left
	JB remain64

	MOVOA r2r1<>+0(SB), X0
loopback64:
	// Keep copies so that both 64-bit halves of each accumulator can be
	// multiplied separately.
	MOVOA X1, X5
	MOVOA X2, X6
	MOVOA X3, X7
	MOVOA X4, X8

	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0, X0, X2
	PCLMULQDQ $0, X0, X3
	PCLMULQDQ $0, X0, X4

	// Load next early
	MOVOU (SI), X11
	MOVOU 16(SI), X12
	MOVOU 32(SI), X13
	MOVOU 48(SI), X14

	PCLMULQDQ $0x11, X0, X5
	PCLMULQDQ $0x11, X0, X6
	PCLMULQDQ $0x11, X0, X7
	PCLMULQDQ $0x11, X0, X8

	// Combine the two partial products of each accumulator.
	PXOR X5, X1
	PXOR X6, X2
	PXOR X7, X3
	PXOR X8, X4

	// Mix in the freshly loaded 64 bytes of input.
	PXOR X11, X1
	PXOR X12, X2
	PXOR X13, X3
	PXOR X14, X4

	ADDQ $64, SI  // buf+=64
	SUBQ $64, CX  // len-=64
	CMPQ CX, $64  // Less than 64 bytes left?
	JGE loopback64

	// Fold result into a single register (X1)
remain64:
	MOVOA r4r3<>+0(SB), X0

	// Fold X1 forward and absorb X2.
	MOVOA X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR X5, X1
	PXOR X2, X1

	// Fold X1 forward and absorb X3.
	MOVOA X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR X5, X1
	PXOR X3, X1

	// Fold X1 forward and absorb X4.
	MOVOA X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR X5, X1
	PXOR X4, X1

	// If there is less than 16 bytes left we are done
	CMPQ CX, $16
	JB finish

	// Encode 16 bytes
remain16:
	MOVOU (SI), X10
	MOVOA X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR X5, X1
	PXOR X10, X1
	SUBQ $16, CX
	ADDQ $16, SI
	CMPQ CX, $16
	JGE remain16

finish:
	// Fold final result into 32 bits and return it
	PCMPEQB X3, X3  // X3 = all ones; turned into a mask below
	PCLMULQDQ $1, X1, X0
	PSRLDQ $8, X1
	PXOR X0, X1

	MOVOA X1, X2
	MOVQ r5<>+0(SB), X0

	// Creates 32 bit mask. Note that we don't care about upper half.
	PSRLQ $32, X3

	PSRLDQ $4, X2
	PAND X3, X1
	PCLMULQDQ $0, X0, X1
	PXOR X2, X1

	MOVOA rupoly<>+0(SB), X0

	MOVOA X1, X2
	PAND X3, X1
	PCLMULQDQ $0x10, X0, X1
	PAND X3, X1
	PCLMULQDQ $0, X0, X1
	PXOR X2, X1

	// The result is the second 32-bit lane of X1.
	PEXTRD $1, X1, AX
	MOVL AX, ret+32(FP)

	RET