// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// castagnoliSSE42 computes the CRC-32C (Castagnoli polynomial) of p,
// continuing from the running checksum crc, using the SSE4.2 hardware
// CRC32 instruction. The caller must have verified SSE4.2 support.
//
// func castagnoliSSE42(crc uint32, p []byte) uint32
TEXT ·castagnoliSSE42(SB),NOSPLIT,$0
	MOVL crc+0(FP), AX  // CRC value
	MOVQ p+8(FP), SI    // data pointer
	MOVQ p_len+16(FP), CX // len(p)

	// The CRC32 instruction operates on the bit-inverted running CRC;
	// invert on entry and again before returning.
	NOTL AX

	/* If there's less than 8 bytes to process, we do it byte-by-byte. */
	CMPQ CX, $8
	JL cleanup

	/* Process individual bytes until the input is 8-byte aligned. */
startup:
	MOVQ SI, BX
	ANDQ $7, BX
	JZ aligned

	CRC32B (SI), AX
	DECQ CX
	INCQ SI
	JMP startup

aligned:
	/* The input is now 8-byte aligned and we can process 8-byte chunks. */
	// Note: at most 7 bytes were consumed above, so CX >= 1 here; the
	// JL below routes any short tail to the byte loop.
	CMPQ CX, $8
	JL cleanup

	CRC32Q (SI), AX
	ADDQ $8, SI
	SUBQ $8, CX
	JMP aligned

cleanup:
	/* We may have some bytes left over that we process one at a time. */
	CMPQ CX, $0
	JE done

	CRC32B (SI), AX
	INCQ SI
	DECQ CX
	JMP cleanup

done:
	// Undo the initial inversion to produce the final checksum.
	NOTL AX
	MOVL AX, ret+32(FP)
	RET

// haveSSE42 reports whether the CPU supports SSE4.2.
// CPUID leaf 1 returns feature flags in ECX; bit 20 is SSE4.2.
//
// func haveSSE42() bool
TEXT ·haveSSE42(SB),NOSPLIT,$0
	XORQ AX, AX
	INCL AX       // EAX = 1: request feature-flag leaf
	CPUID
	SHRQ $20, CX  // isolate ECX bit 20 (SSE4.2)
	ANDQ $1, CX
	MOVB CX, ret+0(FP)
	RET

// haveCLMUL reports whether the CPU supports the PCLMULQDQ
// carry-less multiply instruction (CPUID leaf 1, ECX bit 1).
//
// func haveCLMUL() bool
TEXT ·haveCLMUL(SB),NOSPLIT,$0
	XORQ AX, AX
	INCL AX       // EAX = 1: request feature-flag leaf
	CPUID
	SHRQ $1, CX   // isolate ECX bit 1 (PCLMULQDQ)
	ANDQ $1, CX
	MOVB CX, ret+0(FP)
	RET

// haveSSE41 reports whether the CPU supports SSE4.1
// (CPUID leaf 1, ECX bit 19). Needed for PEXTRD in ieeeCLMUL.
//
// func haveSSE41() bool
TEXT ·haveSSE41(SB),NOSPLIT,$0
	XORQ AX, AX
	INCL AX       // EAX = 1: request feature-flag leaf
	CPUID
	SHRQ $19, CX  // isolate ECX bit 19 (SSE4.1)
	ANDQ $1, CX
	MOVB CX, ret+0(FP)
	RET

// CRC32 polynomial data
//
// These constants are lifted from the
// Linux kernel, since they avoid the costly
// PSHUFB 16 byte reversal proposed in the
// original Intel paper.
DATA r2r1<>+0(SB)/8, $0x154442bd4
DATA r2r1<>+8(SB)/8, $0x1c6e41596
DATA r4r3<>+0(SB)/8, $0x1751997d0
DATA r4r3<>+8(SB)/8, $0x0ccaa009e
DATA rupoly<>+0(SB)/8, $0x1db710641
DATA rupoly<>+8(SB)/8, $0x1f7011641
DATA r5<>+0(SB)/8, $0x163cd6124

GLOBL r2r1<>(SB),RODATA,$16
GLOBL r4r3<>(SB),RODATA,$16
GLOBL rupoly<>(SB),RODATA,$16
GLOBL r5<>(SB),RODATA,$8

// ieeeCLMUL computes the IEEE CRC-32 of p, continuing from crc, by folding
// 64-byte chunks with PCLMULQDQ carry-less multiplies.
//
// Based on http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
// len(p) must be at least 64, and must be a multiple of 16.
// Requires PCLMULQDQ (haveCLMUL) and SSE4.1 (haveSSE41, for PEXTRD).
//
// func ieeeCLMUL(crc uint32, p []byte) uint32
TEXT ·ieeeCLMUL(SB),NOSPLIT,$0
	MOVL crc+0(FP), X0      // Initial CRC value
	MOVQ p+8(FP), SI        // data pointer
	MOVQ p_len+16(FP), CX   // len(p)

	// Load the first 64 bytes and XOR the initial CRC into the first word.
	MOVOU (SI), X1
	MOVOU 16(SI), X2
	MOVOU 32(SI), X3
	MOVOU 48(SI), X4
	PXOR X0, X1
	ADDQ $64, SI  // buf+=64
	SUBQ $64, CX  // len-=64
	CMPQ CX, $64  // Less than 64 bytes left?
	JB remain64

	// Fold 64 bytes per iteration: each of X1..X4 is multiplied by the
	// r2r1 folding constants and XORed with the next 64 bytes of input.
	MOVOA r2r1<>+0(SB), X0
loopback64:
	MOVOA X1, X5
	MOVOA X2, X6
	MOVOA X3, X7
	MOVOA X4, X8

	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0, X0, X2
	PCLMULQDQ $0, X0, X3
	PCLMULQDQ $0, X0, X4

	/* Load next early */
	MOVOU (SI), X11
	MOVOU 16(SI), X12
	MOVOU 32(SI), X13
	MOVOU 48(SI), X14

	PCLMULQDQ $0x11, X0, X5
	PCLMULQDQ $0x11, X0, X6
	PCLMULQDQ $0x11, X0, X7
	PCLMULQDQ $0x11, X0, X8

	PXOR X5, X1
	PXOR X6, X2
	PXOR X7, X3
	PXOR X8, X4

	PXOR X11, X1
	PXOR X12, X2
	PXOR X13, X3
	PXOR X14, X4

	// NOTE: the original code also did "ADDQ $0x40, DI" here — a dead
	// leftover: DI is never initialized or read in this function. Removed.
	ADDQ $64, SI  // buf+=64
	SUBQ $64, CX  // len-=64
	CMPQ CX, $64  // Another full 64-byte chunk available?
	JGE loopback64

	/* Fold result into a single register (X1) */
remain64:
	MOVOA r4r3<>+0(SB), X0

	MOVOA X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR X5, X1
	PXOR X2, X1

	MOVOA X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR X5, X1
	PXOR X3, X1

	MOVOA X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR X5, X1
	PXOR X4, X1

	/* If there is less than 16 bytes left we are done */
	CMPQ CX, $16
	JB finish

	/* Encode 16 bytes */
remain16:
	MOVOU (SI), X10
	MOVOA X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR X5, X1
	PXOR X10, X1
	SUBQ $16, CX
	ADDQ $16, SI
	CMPQ CX, $16
	JGE remain16

finish:
	/* Fold final result into 32 bits and return it */
	PCMPEQB X3, X3            // X3 = all ones (mask scratch)
	PCLMULQDQ $1, X1, X0
	PSRLDQ $8, X1
	PXOR X0, X1

	MOVOA X1, X2
	MOVQ r5<>+0(SB), X0

	/* Creates 32 bit mask. Note that we don't care about upper half. */
	PSRLQ $32, X3

	PSRLDQ $4, X2
	PAND X3, X1
	PCLMULQDQ $0, X0, X1
	PXOR X2, X1

	// Barrett reduction from 64 to 32 bits using rupoly.
	MOVOA rupoly<>+0(SB), X0

	MOVOA X1, X2
	PAND X3, X1
	PCLMULQDQ $0x10, X0, X1
	PAND X3, X1
	PCLMULQDQ $0, X0, X1
	PXOR X2, X1

	/* PEXTRD $1, X1, AX (SSE 4.1) — assembler of this era lacks the mnemonic */
	BYTE $0x66; BYTE $0x0f; BYTE $0x3a;
	BYTE $0x16; BYTE $0xc8; BYTE $0x01;
	MOVL AX, ret+32(FP)

	RET