github.com/rakyll/go@v0.0.0-20170216000551-64c02460d703/src/hash/crc32/crc32_amd64.s (about)

     1  // Copyright 2011 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "textflag.h"
     6  
// castagnoliSSE42 updates the (non-inverted) crc with the given buffer.
//
// It uses the SSE4.2 CRC32 instruction family, which implements the
// Castagnoli polynomial. The input is first brought to 8-byte alignment
// with 1/2/4-byte steps, processed in 8-byte chunks, and the trailing
// 0-7 bytes are handled with 4-, 2- and 1-byte steps.
//
// func castagnoliSSE42(crc uint32, p []byte) uint32
TEXT ·castagnoliSSE42(SB),NOSPLIT,$0
	MOVL crc+0(FP), AX  // CRC value
	MOVQ p+8(FP), SI  // data pointer
	MOVQ p_len+16(FP), CX  // len(p)

	// If there are fewer than 8 bytes to process, skip alignment.
	CMPQ CX, $8
	JL less_than_8

	MOVQ SI, BX
	ANDQ $7, BX  // BX = SI mod 8, the input's misalignment
	JZ aligned

	// Process the first few bytes to 8-byte align the input.

	// BX = 8 - BX. We need to process this many bytes to align.
	SUBQ $1, BX
	XORQ $7, BX  // for BX in 1..7, (BX-1)^7 == 8-BX

	// Bits 0, 1 and 2 of BX select which of a 1-, 2- and 4-byte
	// step are needed to reach alignment.
	BTQ $0, BX
	JNC align_2

	CRC32B (SI), AX
	DECQ CX
	INCQ SI

align_2:
	BTQ $1, BX
	JNC align_4

	// CRC32W (SI), AX
	// (hand-encoded; NOTE(review): presumably the assembler did not
	// accept this operand form when this was written — confirm)
	BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06

	SUBQ $2, CX
	ADDQ $2, SI

align_4:
	BTQ $2, BX
	JNC aligned

	// CRC32L (SI), AX
	// (hand-encoded, same reason as the CRC32W above)
	BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06

	SUBQ $4, CX
	ADDQ $4, SI

aligned:
	// The input is now 8-byte aligned and we can process 8-byte chunks.
	CMPQ CX, $8
	JL less_than_8

	CRC32Q (SI), AX
	ADDQ $8, SI
	SUBQ $8, CX
	JMP aligned

less_than_8:
	// We may have some bytes left over; process 4 bytes, then 2, then 1.
	// Bits 2, 1 and 0 of the remaining length select the steps needed.
	BTQ $2, CX
	JNC less_than_4

	// CRC32L (SI), AX
	BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
	ADDQ $4, SI

less_than_4:
	BTQ $1, CX
	JNC less_than_2

	// CRC32W (SI), AX
	BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
	ADDQ $2, SI

less_than_2:
	BTQ $0, CX
	JNC done

	CRC32B (SI), AX

done:
	MOVL AX, ret+32(FP)
	RET
    92  
// castagnoliSSE42Triple updates three (non-inverted) crcs with (24*rounds)
// bytes from each buffer.
//
// The three streams are interleaved so that the three independent CRC32
// dependency chains can execute in parallel. Each buffer must hold at
// least 24*rounds bytes.
//
// NOTE(review): the loop is bottom-tested (DECQ/JNZ after the body), so
// the body always runs at least once — callers must pass rounds >= 1.
//
// func castagnoliSSE42Triple(
//     crc1, crc2, crc3 uint32,
//     a, b, c []byte,
//     rounds uint32,
// ) (retA uint32, retB uint32, retC uint32)
TEXT ·castagnoliSSE42Triple(SB),NOSPLIT,$0
	MOVL crcA+0(FP), AX
	MOVL crcB+4(FP), CX
	MOVL crcC+8(FP), DX

	MOVQ a+16(FP), R8   // data pointer
	MOVQ b+40(FP), R9   // data pointer
	MOVQ c+64(FP), R10  // data pointer

	MOVL rounds+88(FP), R11

loop:
	// 24 bytes (three 8-byte chunks) from each buffer per iteration.
	CRC32Q (R8), AX
	CRC32Q (R9), CX
	CRC32Q (R10), DX

	CRC32Q 8(R8), AX
	CRC32Q 8(R9), CX
	CRC32Q 8(R10), DX

	CRC32Q 16(R8), AX
	CRC32Q 16(R9), CX
	CRC32Q 16(R10), DX

	ADDQ $24, R8
	ADDQ $24, R9
	ADDQ $24, R10

	DECQ R11
	JNZ loop

	MOVL AX, retA+96(FP)
	MOVL CX, retB+100(FP)
	MOVL DX, retC+104(FP)
	RET
   136  
   137  // func haveSSE42() bool
   138  TEXT ·haveSSE42(SB),NOSPLIT,$0
   139  	XORQ AX, AX
   140  	INCL AX
   141  	CPUID
   142  	SHRQ $20, CX
   143  	ANDQ $1, CX
   144  	MOVB CX, ret+0(FP)
   145  	RET
   146  
   147  // func haveCLMUL() bool
   148  TEXT ·haveCLMUL(SB),NOSPLIT,$0
   149  	XORQ AX, AX
   150  	INCL AX
   151  	CPUID
   152  	SHRQ $1, CX
   153  	ANDQ $1, CX
   154  	MOVB CX, ret+0(FP)
   155  	RET
   156  
   157  // func haveSSE41() bool
   158  TEXT ·haveSSE41(SB),NOSPLIT,$0
   159  	XORQ AX, AX
   160  	INCL AX
   161  	CPUID
   162  	SHRQ $19, CX
   163  	ANDQ $1, CX
   164  	MOVB CX, ret+0(FP)
   165  	RET
   166  
// CRC32 polynomial data
//
// These constants are lifted from the
// Linux kernel, since they avoid the costly
// PSHUFB 16 byte reversal proposed in the
// original Intel paper.
//
// Usage in ieeeCLMUL below:
//   r2r1   - folding constants for the 64-bytes-at-a-time main loop
//            (loopback64).
//   r4r3   - folding constants for the 16-byte folds (remain64,
//            remain16) and the final 128->64 bit fold.
//   r5     - used in the 64->32 bit fold (finish).
//   rupoly - polynomial constants for the final Barrett-style
//            reduction to a 32-bit CRC (finish); exact derivation is
//            in the Intel paper referenced below — NOTE(review).
DATA r2r1<>+0(SB)/8, $0x154442bd4
DATA r2r1<>+8(SB)/8, $0x1c6e41596
DATA r4r3<>+0(SB)/8, $0x1751997d0
DATA r4r3<>+8(SB)/8, $0x0ccaa009e
DATA rupoly<>+0(SB)/8, $0x1db710641
DATA rupoly<>+8(SB)/8, $0x1f7011641
DATA r5<>+0(SB)/8, $0x163cd6124

GLOBL r2r1<>(SB),RODATA,$16
GLOBL r4r3<>(SB),RODATA,$16
GLOBL rupoly<>(SB),RODATA,$16
GLOBL r5<>(SB),RODATA,$8
   185  
   186  // Based on http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
   187  // len(p) must be at least 64, and must be a multiple of 16.
   188  
   189  // func ieeeCLMUL(crc uint32, p []byte) uint32
   190  TEXT ·ieeeCLMUL(SB),NOSPLIT,$0
   191  	MOVL   crc+0(FP), X0             // Initial CRC value
   192  	MOVQ   p+8(FP), SI  	         // data pointer
   193  	MOVQ   p_len+16(FP), CX          // len(p)
   194  
   195  	MOVOU  (SI), X1
   196  	MOVOU  16(SI), X2
   197  	MOVOU  32(SI), X3
   198  	MOVOU  48(SI), X4
   199  	PXOR   X0, X1
   200  	ADDQ   $64, SI                  // buf+=64
   201  	SUBQ   $64, CX                  // len-=64
   202  	CMPQ   CX, $64                  // Less than 64 bytes left
   203  	JB     remain64
   204  
   205  	MOVOA  r2r1<>+0(SB), X0
   206  loopback64:
   207  	MOVOA  X1, X5
   208  	MOVOA  X2, X6
   209  	MOVOA  X3, X7
   210  	MOVOA  X4, X8
   211  
   212  	PCLMULQDQ $0, X0, X1
   213  	PCLMULQDQ $0, X0, X2
   214  	PCLMULQDQ $0, X0, X3
   215  	PCLMULQDQ $0, X0, X4
   216  
   217  	/* Load next early */
   218  	MOVOU    (SI), X11
   219  	MOVOU    16(SI), X12
   220  	MOVOU    32(SI), X13
   221  	MOVOU    48(SI), X14
   222  
   223  	PCLMULQDQ $0x11, X0, X5
   224  	PCLMULQDQ $0x11, X0, X6
   225  	PCLMULQDQ $0x11, X0, X7
   226  	PCLMULQDQ $0x11, X0, X8
   227  
   228  	PXOR     X5, X1
   229  	PXOR     X6, X2
   230  	PXOR     X7, X3
   231  	PXOR     X8, X4
   232  
   233  	PXOR     X11, X1
   234  	PXOR     X12, X2
   235  	PXOR     X13, X3
   236  	PXOR     X14, X4
   237  
   238  	ADDQ    $0x40, DI
   239  	ADDQ    $64, SI      // buf+=64
   240  	SUBQ    $64, CX      // len-=64
   241  	CMPQ    CX, $64      // Less than 64 bytes left?
   242  	JGE     loopback64
   243  
   244  	/* Fold result into a single register (X1) */
   245  remain64:
   246  	MOVOA       r4r3<>+0(SB), X0
   247  
   248  	MOVOA       X1, X5
   249  	PCLMULQDQ   $0, X0, X1
   250  	PCLMULQDQ   $0x11, X0, X5
   251  	PXOR        X5, X1
   252  	PXOR        X2, X1
   253  
   254  	MOVOA       X1, X5
   255  	PCLMULQDQ   $0, X0, X1
   256  	PCLMULQDQ   $0x11, X0, X5
   257  	PXOR        X5, X1
   258  	PXOR        X3, X1
   259  
   260  	MOVOA       X1, X5
   261  	PCLMULQDQ   $0, X0, X1
   262  	PCLMULQDQ   $0x11, X0, X5
   263  	PXOR        X5, X1
   264  	PXOR        X4, X1
   265  
   266  	/* If there is less than 16 bytes left we are done */
   267  	CMPQ        CX, $16
   268  	JB          finish
   269  
   270  	/* Encode 16 bytes */
   271  remain16:
   272  	MOVOU       (SI), X10
   273  	MOVOA       X1, X5
   274  	PCLMULQDQ   $0, X0, X1
   275  	PCLMULQDQ   $0x11, X0, X5
   276  	PXOR        X5, X1
   277  	PXOR        X10, X1
   278  	SUBQ        $16, CX
   279  	ADDQ        $16, SI
   280  	CMPQ        CX, $16
   281  	JGE         remain16
   282  
   283  finish:
   284  	/* Fold final result into 32 bits and return it */
   285  	PCMPEQB     X3, X3
   286  	PCLMULQDQ   $1, X1, X0
   287  	PSRLDQ      $8, X1
   288  	PXOR        X0, X1
   289  
   290  	MOVOA       X1, X2
   291  	MOVQ        r5<>+0(SB), X0
   292  
   293  	/* Creates 32 bit mask. Note that we don't care about upper half. */
   294  	PSRLQ       $32, X3
   295  
   296  	PSRLDQ      $4, X2
   297  	PAND        X3, X1
   298  	PCLMULQDQ   $0, X0, X1
   299  	PXOR        X2, X1
   300  
   301  	MOVOA       rupoly<>+0(SB), X0
   302  
   303  	MOVOA       X1, X2
   304  	PAND        X3, X1
   305  	PCLMULQDQ   $0x10, X0, X1
   306  	PAND        X3, X1
   307  	PCLMULQDQ   $0, X0, X1
   308  	PXOR        X2, X1
   309  
   310  	PEXTRD	$1, X1, AX
   311  	MOVL        AX, ret+32(FP)
   312  
   313  	RET