github.com/devops-filetransfer/sshego@v7.0.4+incompatible/_vendor/golang.org/x/crypto/blake2b/blake2b_amd64.s (about)

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build amd64,!gccgo,!appengine
     6  
     7  #include "textflag.h"
     8  
     9  DATA ·iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
    10  DATA ·iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
    11  GLOBL ·iv0<>(SB), (NOPTR+RODATA), $16
    12  
    13  DATA ·iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b
    14  DATA ·iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1
    15  GLOBL ·iv1<>(SB), (NOPTR+RODATA), $16
    16  
    17  DATA ·iv2<>+0x00(SB)/8, $0x510e527fade682d1
    18  DATA ·iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
    19  GLOBL ·iv2<>(SB), (NOPTR+RODATA), $16
    20  
    21  DATA ·iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b
    22  DATA ·iv3<>+0x08(SB)/8, $0x5be0cd19137e2179
    23  GLOBL ·iv3<>(SB), (NOPTR+RODATA), $16
    24  
    25  DATA ·c40<>+0x00(SB)/8, $0x0201000706050403
    26  DATA ·c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
    27  GLOBL ·c40<>(SB), (NOPTR+RODATA), $16
    28  
    29  DATA ·c48<>+0x00(SB)/8, $0x0100070605040302
    30  DATA ·c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
    31  GLOBL ·c48<>(SB), (NOPTR+RODATA), $16
    32  
    33  #define SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2) \
    34  	MOVO       v4, t1; \
    35  	MOVO       v5, v4; \
    36  	MOVO       t1, v5; \
    37  	MOVO       v6, t1; \
    38  	PUNPCKLQDQ v6, t2; \
    39  	PUNPCKHQDQ v7, v6; \
    40  	PUNPCKHQDQ t2, v6; \
    41  	PUNPCKLQDQ v7, t2; \
    42  	MOVO       t1, v7; \
    43  	MOVO       v2, t1; \
    44  	PUNPCKHQDQ t2, v7; \
    45  	PUNPCKLQDQ v3, t2; \
    46  	PUNPCKHQDQ t2, v2; \
    47  	PUNPCKLQDQ t1, t2; \
    48  	PUNPCKHQDQ t2, v3
    49  
    50  #define SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2) \
    51  	MOVO       v4, t1; \
    52  	MOVO       v5, v4; \
    53  	MOVO       t1, v5; \
    54  	MOVO       v2, t1; \
    55  	PUNPCKLQDQ v2, t2; \
    56  	PUNPCKHQDQ v3, v2; \
    57  	PUNPCKHQDQ t2, v2; \
    58  	PUNPCKLQDQ v3, t2; \
    59  	MOVO       t1, v3; \
    60  	MOVO       v6, t1; \
    61  	PUNPCKHQDQ t2, v3; \
    62  	PUNPCKLQDQ v7, t2; \
    63  	PUNPCKHQDQ t2, v6; \
    64  	PUNPCKLQDQ t1, t2; \
    65  	PUNPCKHQDQ t2, v7
    66  
    67  #define HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, m0, m1, m2, m3, t0, c40, c48) \
    68  	PADDQ  m0, v0;        \
    69  	PADDQ  m1, v1;        \
    70  	PADDQ  v2, v0;        \
    71  	PADDQ  v3, v1;        \
    72  	PXOR   v0, v6;        \
    73  	PXOR   v1, v7;        \
    74  	PSHUFD $0xB1, v6, v6; \
    75  	PSHUFD $0xB1, v7, v7; \
    76  	PADDQ  v6, v4;        \
    77  	PADDQ  v7, v5;        \
    78  	PXOR   v4, v2;        \
    79  	PXOR   v5, v3;        \
    80  	PSHUFB c40, v2;       \
    81  	PSHUFB c40, v3;       \
    82  	PADDQ  m2, v0;        \
    83  	PADDQ  m3, v1;        \
    84  	PADDQ  v2, v0;        \
    85  	PADDQ  v3, v1;        \
    86  	PXOR   v0, v6;        \
    87  	PXOR   v1, v7;        \
    88  	PSHUFB c48, v6;       \
    89  	PSHUFB c48, v7;       \
    90  	PADDQ  v6, v4;        \
    91  	PADDQ  v7, v5;        \
    92  	PXOR   v4, v2;        \
    93  	PXOR   v5, v3;        \
    94  	MOVOU  v2, t0;        \
    95  	PADDQ  v2, t0;        \
    96  	PSRLQ  $63, v2;       \
    97  	PXOR   t0, v2;        \
    98  	MOVOU  v3, t0;        \
    99  	PADDQ  v3, t0;        \
   100  	PSRLQ  $63, v3;       \
   101  	PXOR   t0, v3
   102  
   103  #define LOAD_MSG(m0, m1, m2, m3, src, i0, i1, i2, i3, i4, i5, i6, i7) \
   104  	MOVQ   i0*8(src), m0;     \
   105  	PINSRQ $1, i1*8(src), m0; \
   106  	MOVQ   i2*8(src), m1;     \
   107  	PINSRQ $1, i3*8(src), m1; \
   108  	MOVQ   i4*8(src), m2;     \
   109  	PINSRQ $1, i5*8(src), m2; \
   110  	MOVQ   i6*8(src), m3;     \
   111  	PINSRQ $1, i7*8(src), m3
   112  
   113  // func hashBlocksSSE4(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
   114  TEXT ·hashBlocksSSE4(SB), 4, $288-48 // frame size = 272 + 16 byte alignment
   115  	MOVQ h+0(FP), AX
   116  	MOVQ c+8(FP), BX
   117  	MOVQ flag+16(FP), CX
   118  	MOVQ blocks_base+24(FP), SI
   119  	MOVQ blocks_len+32(FP), DI
   120  
   121  	MOVQ SP, BP
   122  	MOVQ SP, R9
   123  	ADDQ $15, R9
   124  	ANDQ $~15, R9
   125  	MOVQ R9, SP
   126  
   127  	MOVOU ·iv3<>(SB), X0
   128  	MOVO  X0, 0(SP)
   129  	XORQ  CX, 0(SP)     // 0(SP) = ·iv3 ^ (CX || 0)
   130  
   131  	MOVOU ·c40<>(SB), X13
   132  	MOVOU ·c48<>(SB), X14
   133  
   134  	MOVOU 0(AX), X12
   135  	MOVOU 16(AX), X15
   136  
   137  	MOVQ 0(BX), R8
   138  	MOVQ 8(BX), R9
   139  
   140  loop:
   141  	ADDQ $128, R8
   142  	CMPQ R8, $128
   143  	JGE  noinc
   144  	INCQ R9
   145  
   146  noinc:
   147  	MOVQ R8, X8
   148  	PINSRQ $1, R9, X8
   149  
   150  	MOVO X12, X0
   151  	MOVO X15, X1
   152  	MOVOU 32(AX), X2
   153  	MOVOU 48(AX), X3
   154  	MOVOU ·iv0<>(SB), X4
   155  	MOVOU ·iv1<>(SB), X5
   156  	MOVOU ·iv2<>(SB), X6
   157  
   158  	PXOR X8, X6
   159  	MOVO 0(SP), X7
   160  
   161  	LOAD_MSG(X8, X9, X10, X11, SI, 0, 2, 4, 6, 1, 3, 5, 7)
   162  	MOVO X8, 16(SP)
   163  	MOVO X9, 32(SP)
   164  	MOVO X10, 48(SP)
   165  	MOVO X11, 64(SP)
   166  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
   167  	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
   168  	LOAD_MSG(X8, X9, X10, X11, SI, 8, 10, 12, 14, 9, 11, 13, 15)
   169  	MOVO X8, 80(SP)
   170  	MOVO X9, 96(SP)
   171  	MOVO X10, 112(SP)
   172  	MOVO X11, 128(SP)
   173  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
   174  	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
   175  
   176  	LOAD_MSG(X8, X9, X10, X11, SI, 14, 4, 9, 13, 10, 8, 15, 6)
   177  	MOVO X8, 144(SP)
   178  	MOVO X9, 160(SP)
   179  	MOVO X10, 176(SP)
   180  	MOVO X11, 192(SP)
   181  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
   182  	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
   183  	LOAD_MSG(X8, X9, X10, X11, SI, 1, 0, 11, 5, 12, 2, 7, 3)
   184  	MOVO X8, 208(SP)
   185  	MOVO X9, 224(SP)
   186  	MOVO X10, 240(SP)
   187  	MOVO X11, 256(SP)
   188  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
   189  	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
   190  
   191  	LOAD_MSG(X8, X9, X10, X11, SI, 11, 12, 5, 15, 8, 0, 2, 13)
   192  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
   193  	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
   194  	LOAD_MSG(X8, X9, X10, X11, SI, 10, 3, 7, 9, 14, 6, 1, 4)
   195  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
   196  	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
   197  
   198  	LOAD_MSG(X8, X9, X10, X11, SI, 7, 3, 13, 11, 9, 1, 12, 14)
   199  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
   200  	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
   201  	LOAD_MSG(X8, X9, X10, X11, SI, 2, 5, 4, 15, 6, 10, 0, 8)
   202  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
   203  	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
   204  
   205  	LOAD_MSG(X8, X9, X10, X11, SI, 9, 5, 2, 10, 0, 7, 4, 15)
   206  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
   207  	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
   208  	LOAD_MSG(X8, X9, X10, X11, SI, 14, 11, 6, 3, 1, 12, 8, 13)
   209  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
   210  	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
   211  
   212  	LOAD_MSG(X8, X9, X10, X11, SI, 2, 6, 0, 8, 12, 10, 11, 3)
   213  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
   214  	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
   215  	LOAD_MSG(X8, X9, X10, X11, SI, 4, 7, 15, 1, 13, 5, 14, 9)
   216  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
   217  	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
   218  
   219  	LOAD_MSG(X8, X9, X10, X11, SI, 12, 1, 14, 4, 5, 15, 13, 10)
   220  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
   221  	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
   222  	LOAD_MSG(X8, X9, X10, X11, SI, 0, 6, 9, 8, 7, 3, 2, 11)
   223  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
   224  	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
   225  
   226  	LOAD_MSG(X8, X9, X10, X11, SI, 13, 7, 12, 3, 11, 14, 1, 9)
   227  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
   228  	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
   229  	LOAD_MSG(X8, X9, X10, X11, SI, 5, 15, 8, 2, 0, 4, 6, 10)
   230  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
   231  	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
   232  
   233  	LOAD_MSG(X8, X9, X10, X11, SI, 6, 14, 11, 0, 15, 9, 3, 8)
   234  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
   235  	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
   236  	LOAD_MSG(X8, X9, X10, X11, SI, 12, 13, 1, 10, 2, 7, 4, 5)
   237  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
   238  	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
   239  
   240  	LOAD_MSG(X8, X9, X10, X11, SI, 10, 8, 7, 1, 2, 4, 6, 5)
   241  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
   242  	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
   243  	LOAD_MSG(X8, X9, X10, X11, SI, 15, 9, 3, 13, 11, 14, 12, 0)
   244  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
   245  	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
   246  
   247  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 16(SP), 32(SP), 48(SP), 64(SP), X11, X13, X14)
   248  	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
   249  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 80(SP), 96(SP), 112(SP), 128(SP), X11, X13, X14)
   250  	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
   251  
   252  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 144(SP), 160(SP), 176(SP), 192(SP), X11, X13, X14)
   253  	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
   254  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 208(SP), 224(SP), 240(SP), 256(SP), X11, X13, X14)
   255  	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
   256  
   257  	MOVOU 32(AX), X10
   258  	MOVOU 48(AX), X11
   259  	PXOR  X0, X12
   260  	PXOR  X1, X15
   261  	PXOR  X2, X10
   262  	PXOR  X3, X11
   263  	PXOR  X4, X12
   264  	PXOR  X5, X15
   265  	PXOR  X6, X10
   266  	PXOR  X7, X11
   267  	MOVOU X10, 32(AX)
   268  	MOVOU X11, 48(AX)
   269  
   270  	LEAQ 128(SI), SI
   271  	SUBQ $128, DI
   272  	JNE  loop
   273  
   274  	MOVOU X12, 0(AX)
   275  	MOVOU X15, 16(AX)
   276  
   277  	MOVQ R8, 0(BX)
   278  	MOVQ R9, 8(BX)
   279  
   280  	MOVQ BP, SP
   281  	RET
   282  
   283  // func supportsSSE4() bool
   284  TEXT ·supportsSSE4(SB), 4, $0-1
   285  	MOVL $1, AX
   286  	CPUID
   287  	SHRL $19, CX  // Bit 19 indicates SSE4 support
   288  	ANDL $1, CX  // CX != 0 if support SSE4
   289  	MOVB CX, ret+0(FP)
   290  	RET