github.com/corona10/go@v0.0.0-20180224231303-7a218942be57/src/hash/crc32/crc32_amd64.s

// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// castagnoliSSE42 updates the (non-inverted) crc with the given buffer.
//
// func castagnoliSSE42(crc uint32, p []byte) uint32
TEXT ·castagnoliSSE42(SB),NOSPLIT,$0
	MOVL crc+0(FP), AX  // CRC value
	MOVQ p+8(FP), SI  // data pointer
	MOVQ p_len+16(FP), CX  // len(p)

	// If there are fewer than 8 bytes to process, skip alignment.
	CMPQ CX, $8
	JL less_than_8

	MOVQ SI, BX
	ANDQ $7, BX
	JZ aligned

	// Process the first few bytes to 8-byte align the input.

	// BX = 8 - BX. We need to process this many bytes to align.
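	// (For a 3-bit value x, x XOR 7 equals 7-x, so with BX in [1,7] the
	// SUBQ/XORQ pair below computes (BX-1) XOR 7 = 8-BX in place,
	// without needing another register.)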
	SUBQ $1, BX
	XORQ $7, BX

	BTQ $0, BX
	JNC align_2

	CRC32B (SI), AX
	DECQ CX
	INCQ SI

align_2:
	BTQ $1, BX
	JNC align_4

	// CRC32W (SI), AX
	BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
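	// The raw bytes hand-assemble the CRC32 of the 16-bit word at (SI)
	// into AX: 0x66 is the operand-size prefix, F2 0F 38 F1 is the CRC32
	// opcode, and ModRM 0x06 selects AX and (SI). Presumably encoded by
	// hand because the assembler did not accept this form.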

	SUBQ $2, CX
	ADDQ $2, SI

align_4:
	BTQ $2, BX
	JNC aligned

	// CRC32L (SI), AX
	BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
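	// (Same hand-assembled encoding as above, minus the 0x66 prefix:
	// without it, F2 0F 38 F1 with ModRM 0x06 computes the 32-bit CRC of
	// (SI) into AX.)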

	SUBQ $4, CX
	ADDQ $4, SI

aligned:
	// The input is now 8-byte aligned and we can process 8-byte chunks.
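	// (Aligning first keeps every CRC32Q load inside a single 8-byte
	// word; unaligned 8-byte loads can straddle cache lines, which is
	// slower on many x86 cores.)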
	CMPQ CX, $8
	JL less_than_8

	CRC32Q (SI), AX
	ADDQ $8, SI
	SUBQ $8, CX
	JMP aligned

less_than_8:
	// We may have some bytes left over; process 4 bytes, then 2, then 1.
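	// CX is less than 8 here, so bits 2, 1 and 0 of CX say exactly which
	// of the 4-, 2- and 1-byte steps are still needed.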
	BTQ $2, CX
	JNC less_than_4

	// CRC32L (SI), AX
	BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
	ADDQ $4, SI

less_than_4:
	BTQ $1, CX
	JNC less_than_2

	// CRC32W (SI), AX
	BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
	ADDQ $2, SI

less_than_2:
	BTQ $0, CX
	JNC done

	CRC32B (SI), AX

done:
	MOVL AX, ret+32(FP)
	RET

// castagnoliSSE42Triple updates three (non-inverted) crcs with (24*rounds)
// bytes from each buffer.
//
// func castagnoliSSE42Triple(
//     crcA, crcB, crcC uint32,
//     a, b, c []byte,
//     rounds uint32,
// ) (retA uint32, retB uint32, retC uint32)
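//
// The three CRC32Q chains in the loop below are independent of one
// another. CRC32 typically has a multi-cycle latency but single-cycle
// throughput, so interleaving three streams roughly triples the bytes
// hashed per cycle compared with a single chain.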
TEXT ·castagnoliSSE42Triple(SB),NOSPLIT,$0
	MOVL crcA+0(FP), AX
	MOVL crcB+4(FP), CX
	MOVL crcC+8(FP), DX

	MOVQ a+16(FP), R8   // data pointer
	MOVQ b+40(FP), R9   // data pointer
	MOVQ c+64(FP), R10  // data pointer

	MOVL rounds+88(FP), R11

loop:
	CRC32Q (R8), AX
	CRC32Q (R9), CX
	CRC32Q (R10), DX

	CRC32Q 8(R8), AX
	CRC32Q 8(R9), CX
	CRC32Q 8(R10), DX

	CRC32Q 16(R8), AX
	CRC32Q 16(R9), CX
	CRC32Q 16(R10), DX

	ADDQ $24, R8
	ADDQ $24, R9
	ADDQ $24, R10

	DECQ R11
	JNZ loop

	MOVL AX, retA+96(FP)
	MOVL CX, retB+100(FP)
	MOVL DX, retC+104(FP)
	RET

// CRC32 polynomial data
//
// These constants are lifted from the Linux kernel, since they avoid the
// costly PSHUFB 16-byte reversal proposed in the original Intel paper.
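//
// Following the naming in the kernel's crc32-pclmul code (values are
// bit-reflected, with the leading polynomial bit kept explicit):
//   r2r1:   fold-by-4 constants, x^(4*128+32) mod P and x^(4*128-32) mod P
//   r4r3:   fold-by-1 constants, x^(128+32) mod P and x^(128-32) mod P
//   r5:     x^64 mod P
//   rupoly: the reversed polynomial P' (low half) and the Barrett
//           constant u = floor(x^64 / P) (high half)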
DATA r2r1<>+0(SB)/8, $0x154442bd4
DATA r2r1<>+8(SB)/8, $0x1c6e41596
DATA r4r3<>+0(SB)/8, $0x1751997d0
DATA r4r3<>+8(SB)/8, $0x0ccaa009e
DATA rupoly<>+0(SB)/8, $0x1db710641
DATA rupoly<>+8(SB)/8, $0x1f7011641
DATA r5<>+0(SB)/8, $0x163cd6124

GLOBL r2r1<>(SB),RODATA,$16
GLOBL r4r3<>(SB),RODATA,$16
GLOBL rupoly<>(SB),RODATA,$16
GLOBL r5<>(SB),RODATA,$8

// Based on http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
// len(p) must be at least 64, and must be a multiple of 16.

// func ieeeCLMUL(crc uint32, p []byte) uint32
TEXT ·ieeeCLMUL(SB),NOSPLIT,$0
	MOVL   crc+0(FP), X0             // Initial CRC value
	MOVQ   p+8(FP), SI               // data pointer
	MOVQ   p_len+16(FP), CX          // len(p)

	MOVOU  (SI), X1
	MOVOU  16(SI), X2
	MOVOU  32(SI), X3
	MOVOU  48(SI), X4
	PXOR   X0, X1
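	// crc arrived in the low doubleword of X0, so this XOR folds the
	// initial CRC value into the first 16 bytes of input; from here on
	// the CRC state is spread across X1..X4.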
	ADDQ   $64, SI                  // buf+=64
	SUBQ   $64, CX                  // len-=64
	CMPQ   CX, $64                  // Less than 64 bytes left?
	JB     remain64

	MOVOA  r2r1<>+0(SB), X0
loopback64:
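	// Fold-by-4: each accumulator Xi becomes
	//   lo64(Xi)*r1 xor hi64(Xi)*r2 xor (next 16 bytes of input)
	// where r1/r2 are the low/high halves of r2r1 held in X0. PCLMULQDQ
	// $0 multiplies the two low 64-bit halves, $0x11 the two high halves.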
	MOVOA  X1, X5
	MOVOA  X2, X6
	MOVOA  X3, X7
	MOVOA  X4, X8

	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0, X0, X2
	PCLMULQDQ $0, X0, X3
	PCLMULQDQ $0, X0, X4

	/* Load next early */
	MOVOU    (SI), X11
	MOVOU    16(SI), X12
	MOVOU    32(SI), X13
	MOVOU    48(SI), X14

	PCLMULQDQ $0x11, X0, X5
	PCLMULQDQ $0x11, X0, X6
	PCLMULQDQ $0x11, X0, X7
	PCLMULQDQ $0x11, X0, X8

	PXOR     X5, X1
	PXOR     X6, X2
	PXOR     X7, X3
	PXOR     X8, X4

	PXOR     X11, X1
	PXOR     X12, X2
	PXOR     X13, X3
	PXOR     X14, X4

	ADDQ    $64, SI      // buf+=64
	SUBQ    $64, CX      // len-=64
	CMPQ    CX, $64      // Less than 64 bytes left?
	JGE     loopback64

	/* Fold result into a single register (X1) */
remain64:
	MOVOA       r4r3<>+0(SB), X0
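	// Each block below is a fold-by-1 step: X1 = lo64(X1)*r3 xor
	// hi64(X1)*r4 xor X(next), with r3/r4 the low/high halves of r4r3.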

	MOVOA       X1, X5
	PCLMULQDQ   $0, X0, X1
	PCLMULQDQ   $0x11, X0, X5
	PXOR        X5, X1
	PXOR        X2, X1

	MOVOA       X1, X5
	PCLMULQDQ   $0, X0, X1
	PCLMULQDQ   $0x11, X0, X5
	PXOR        X5, X1
	PXOR        X3, X1

	MOVOA       X1, X5
	PCLMULQDQ   $0, X0, X1
	PCLMULQDQ   $0x11, X0, X5
	PXOR        X5, X1
	PXOR        X4, X1

	/* If there are fewer than 16 bytes left, we are done */
	CMPQ        CX, $16
	JB          finish

	/* Fold in any remaining 16-byte chunks */
remain16:
	MOVOU       (SI), X10
	MOVOA       X1, X5
	PCLMULQDQ   $0, X0, X1
	PCLMULQDQ   $0x11, X0, X5
	PXOR        X5, X1
	PXOR        X10, X1
	SUBQ        $16, CX
	ADDQ        $16, SI
	CMPQ        CX, $16
	JGE         remain16

finish:
	/* Fold final result into 32 bits and return it */
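	// Staged reduction: the 128-bit remainder in X1 is first narrowed
	// with the help of r5 (x^64 mod P), then a Barrett reduction using
	// the u and P' constants in rupoly yields the 32-bit CRC. The result
	// lands in the second doubleword of X1, hence the PEXTRD $1 below.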
	PCMPEQB     X3, X3
	PCLMULQDQ   $1, X1, X0
	PSRLDQ      $8, X1
	PXOR        X0, X1

	MOVOA       X1, X2
	MOVQ        r5<>+0(SB), X0

	/* Creates a 32-bit mask. Note that we don't care about the upper half. */
	PSRLQ       $32, X3

	PSRLDQ      $4, X2
	PAND        X3, X1
	PCLMULQDQ   $0, X0, X1
	PXOR        X2, X1

	MOVOA       rupoly<>+0(SB), X0

	MOVOA       X1, X2
	PAND        X3, X1
	PCLMULQDQ   $0x10, X0, X1
	PAND        X3, X1
	PCLMULQDQ   $0, X0, X1
	PXOR        X2, X1

	PEXTRD      $1, X1, AX
	MOVL        AX, ret+32(FP)

	RET