github.com/corona10/go@v0.0.0-20180224231303-7a218942be57/src/hash/crc32/crc32_amd64.s

// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// castagnoliSSE42 updates the (non-inverted) crc with the given buffer.
//
// func castagnoliSSE42(crc uint32, p []byte) uint32
TEXT ·castagnoliSSE42(SB),NOSPLIT,$0
	MOVL crc+0(FP), AX       // CRC value
	MOVQ p+8(FP), SI         // data pointer
	MOVQ p_len+16(FP), CX    // len(p)

	// If there are fewer than 8 bytes to process, skip alignment.
	CMPQ CX, $8
	JL less_than_8

	MOVQ SI, BX
	ANDQ $7, BX
	JZ aligned

	// Process the first few bytes to 8-byte align the input.

	// BX = 8 - BX. We need to process this many bytes to align.
	SUBQ $1, BX
	XORQ $7, BX

	BTQ $0, BX
	JNC align_2

	CRC32B (SI), AX
	DECQ CX
	INCQ SI

align_2:
	BTQ $1, BX
	JNC align_4

	// CRC32W (SI), AX
	BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06

	SUBQ $2, CX
	ADDQ $2, SI

align_4:
	BTQ $2, BX
	JNC aligned

	// CRC32L (SI), AX
	BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06

	SUBQ $4, CX
	ADDQ $4, SI

aligned:
	// The input is now 8-byte aligned and we can process 8-byte chunks.
	CMPQ CX, $8
	JL less_than_8

	CRC32Q (SI), AX
	ADDQ $8, SI
	SUBQ $8, CX
	JMP aligned

less_than_8:
	// We may have some bytes left over; process 4 bytes, then 2, then 1.
	BTQ $2, CX
	JNC less_than_4

	// CRC32L (SI), AX
	BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
	ADDQ $4, SI

less_than_4:
	BTQ $1, CX
	JNC less_than_2

	// CRC32W (SI), AX
	BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
	ADDQ $2, SI

less_than_2:
	BTQ $0, CX
	JNC done

	CRC32B (SI), AX

done:
	MOVL AX, ret+32(FP)
	RET

// castagnoliSSE42Triple updates three (non-inverted) crcs with (24*rounds)
// bytes from each buffer.
//
// func castagnoliSSE42Triple(
//     crcA, crcB, crcC uint32,
//     a, b, c []byte,
//     rounds uint32,
// ) (retA uint32, retB uint32, retC uint32)
TEXT ·castagnoliSSE42Triple(SB),NOSPLIT,$0
	MOVL crcA+0(FP), AX
	MOVL crcB+4(FP), CX
	MOVL crcC+8(FP), DX

	MOVQ a+16(FP), R8    // data pointer
	MOVQ b+40(FP), R9    // data pointer
	MOVQ c+64(FP), R10   // data pointer

	MOVL rounds+88(FP), R11

loop:
	CRC32Q (R8), AX
	CRC32Q (R9), CX
	CRC32Q (R10), DX

	CRC32Q 8(R8), AX
	CRC32Q 8(R9), CX
	CRC32Q 8(R10), DX

	CRC32Q 16(R8), AX
	CRC32Q 16(R9), CX
	CRC32Q 16(R10), DX

	ADDQ $24, R8
	ADDQ $24, R9
	ADDQ $24, R10

	DECQ R11
	JNZ loop

	MOVL AX, retA+96(FP)
	MOVL CX, retB+100(FP)
	MOVL DX, retC+104(FP)
	RET
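// Both routines above compute the same (non-inverted) Castagnoli update
// that the SSE4.2 CRC32 instruction performs in hardware. As a hedged
// reference, a bit-at-a-time Go equivalent could look like the sketch
// below (castagnoliRef is an illustrative name, not a function in this
// package):
//
//	func castagnoliRef(crc uint32, p []byte) uint32 {
//		const poly = 0x82F63B78 // reversed Castagnoli polynomial
//		for _, b := range p {
//			crc ^= uint32(b)
//			for i := 0; i < 8; i++ {
//				if crc&1 != 0 {
//					crc = crc>>1 ^ poly
//				} else {
//					crc >>= 1
//				}
//			}
//		}
//		return crc
//	}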
// CRC32 polynomial data
//
// These constants are lifted from the Linux kernel, since they avoid
// the costly PSHUFB 16-byte reversal proposed in the original Intel
// paper.
DATA r2r1<>+0(SB)/8, $0x154442bd4
DATA r2r1<>+8(SB)/8, $0x1c6e41596
DATA r4r3<>+0(SB)/8, $0x1751997d0
DATA r4r3<>+8(SB)/8, $0x0ccaa009e
DATA rupoly<>+0(SB)/8, $0x1db710641
DATA rupoly<>+8(SB)/8, $0x1f7011641
DATA r5<>+0(SB)/8, $0x163cd6124

GLOBL r2r1<>(SB), RODATA, $16
GLOBL r4r3<>(SB), RODATA, $16
GLOBL rupoly<>(SB), RODATA, $16
GLOBL r5<>(SB), RODATA, $8

// Based on http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
// len(p) must be at least 64, and must be a multiple of 16.
//
// func ieeeCLMUL(crc uint32, p []byte) uint32
TEXT ·ieeeCLMUL(SB),NOSPLIT,$0
	MOVL crc+0(FP), X0       // Initial CRC value
	MOVQ p+8(FP), SI         // data pointer
	MOVQ p_len+16(FP), CX    // len(p)

	MOVOU (SI), X1
	MOVOU 16(SI), X2
	MOVOU 32(SI), X3
	MOVOU 48(SI), X4
	PXOR X0, X1
	ADDQ $64, SI    // buf+=64
	SUBQ $64, CX    // len-=64
	CMPQ CX, $64    // Fewer than 64 bytes left?
	JB remain64

	MOVOA r2r1<>+0(SB), X0

loopback64:
	MOVOA X1, X5
	MOVOA X2, X6
	MOVOA X3, X7
	MOVOA X4, X8

	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0, X0, X2
	PCLMULQDQ $0, X0, X3
	PCLMULQDQ $0, X0, X4

	/* Load next early */
	MOVOU (SI), X11
	MOVOU 16(SI), X12
	MOVOU 32(SI), X13
	MOVOU 48(SI), X14

	PCLMULQDQ $0x11, X0, X5
	PCLMULQDQ $0x11, X0, X6
	PCLMULQDQ $0x11, X0, X7
	PCLMULQDQ $0x11, X0, X8

	PXOR X5, X1
	PXOR X6, X2
	PXOR X7, X3
	PXOR X8, X4

	PXOR X11, X1
	PXOR X12, X2
	PXOR X13, X3
	PXOR X14, X4

	ADDQ $64, SI    // buf+=64
	SUBQ $64, CX    // len-=64
	CMPQ CX, $64    // Fewer than 64 bytes left?
	JGE loopback64

	/* Fold result into a single register (X1) */
remain64:
	MOVOA r4r3<>+0(SB), X0

	MOVOA X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR X5, X1
	PXOR X2, X1

	MOVOA X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR X5, X1
	PXOR X3, X1

	MOVOA X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR X5, X1
	PXOR X4, X1

	/* If there are fewer than 16 bytes left, we are done */
	CMPQ CX, $16
	JB finish

	/* Encode 16 bytes */
remain16:
	MOVOU (SI), X10
	MOVOA X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR X5, X1
	PXOR X10, X1
	SUBQ $16, CX
	ADDQ $16, SI
	CMPQ CX, $16
	JGE remain16

finish:
	/* Fold final result into 32 bits and return it */
	PCMPEQB X3, X3
	PCLMULQDQ $1, X1, X0
	PSRLDQ $8, X1
	PXOR X0, X1

	MOVOA X1, X2
	MOVQ r5<>+0(SB), X0

	/* Create a 32-bit mask; we don't care about the upper half. */
	PSRLQ $32, X3

	PSRLDQ $4, X2
	PAND X3, X1
	PCLMULQDQ $0, X0, X1
	PXOR X2, X1

	MOVOA rupoly<>+0(SB), X0

	MOVOA X1, X2
	PAND X3, X1
	PCLMULQDQ $0x10, X0, X1
	PAND X3, X1
	PCLMULQDQ $0, X0, X1
	PXOR X2, X1

	PEXTRD $1, X1, AX
	MOVL AX, ret+32(FP)

	RET
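// ieeeCLMUL requires len(p) >= 64 and len(p)%16 == 0, so the Go caller
// must peel off any ragged tail for a table-driven fallback. A hedged
// sketch of such a wrapper, ignoring the pre/post-inversion handled
// elsewhere (updateIEEE and tableUpdate are illustrative names, not
// necessarily the ones used in crc32_amd64.go):
//
//	func updateIEEE(crc uint32, p []byte) uint32 {
//		if len(p) >= 64 {
//			aligned := len(p) &^ 15 // largest multiple of 16 <= len(p)
//			crc = ieeeCLMUL(crc, p[:aligned])
//			p = p[aligned:]
//		}
//		return tableUpdate(crc, p) // table-driven fallback for the tail
//	}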