github.com/neatio-net/neatio@v1.7.3-0.20231114194659-f4d7a2226baa/utilities/crypto/blake2b/blake2b_amd64.s (about)

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build amd64,!gccgo,!appengine
     6  
     7  #include "textflag.h"
     8  
     9  DATA ·iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908 // BLAKE2b IV[0]
    10  DATA ·iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b // BLAKE2b IV[1]
    11  GLOBL ·iv0<>(SB), (NOPTR+RODATA), $16 // 16-byte read-only constant; fSSE4 loads it into X4
    12  
    13  DATA ·iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b // BLAKE2b IV[2]
    14  DATA ·iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1 // BLAKE2b IV[3]
    15  GLOBL ·iv1<>(SB), (NOPTR+RODATA), $16 // 16-byte read-only constant; fSSE4 loads it into X5
    16  
    17  DATA ·iv2<>+0x00(SB)/8, $0x510e527fade682d1 // BLAKE2b IV[4]
    18  DATA ·iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f // BLAKE2b IV[5]
    19  GLOBL ·iv2<>(SB), (NOPTR+RODATA), $16 // 16-byte read-only constant; fSSE4 loads it into X6 and XORs in (c0, c1)
    20  
    21  DATA ·iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b // BLAKE2b IV[6]; fSSE4 XORs flag into its copy of this word
    22  DATA ·iv3<>+0x08(SB)/8, $0x5be0cd19137e2179 // BLAKE2b IV[7]
    23  GLOBL ·iv3<>(SB), (NOPTR+RODATA), $16 // 16-byte read-only constant; ends up in X7 after the flag XOR
    24  
    25  DATA ·c40<>+0x00(SB)/8, $0x0201000706050403 // low lane: result byte i = source byte (i+3) mod 8
    26  DATA ·c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b // high lane: same pattern, offset by 8
    27  GLOBL ·c40<>(SB), (NOPTR+RODATA), $16 // PSHUFB mask: rotates each 64-bit lane right by 24 bits (left by 40); kept in X13
    28  
    29  DATA ·c48<>+0x00(SB)/8, $0x0100070605040302 // low lane: result byte i = source byte (i+2) mod 8
    30  DATA ·c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a // high lane: same pattern, offset by 8
    31  GLOBL ·c48<>(SB), (NOPTR+RODATA), $16 // PSHUFB mask: rotates each 64-bit lane right by 16 bits (left by 48); kept in X14
    32  
    // SHUFFLE rotates three rows of the 4x4 word matrix between the two
    // HALF_ROUND calls of a round. Each row is a register pair holding four
    // 64-bit words (first register = words 0,1; low quadword = lower index):
    //   (b0, b1)  rotated left by one word    [w0 w1 w2 w3] -> [w1 w2 w3 w0]
    //   (c0, c1)  registers swapped           [w0 w1 w2 w3] -> [w2 w3 w0 w1]
    //   (d0, d1)  rotated right by one word   [w0 w1 w2 w3] -> [w3 w0 w1 w2]
    // sav and mix are scratch: both are clobbered, and only the high quadword
    // of mix is ever consumed, always after the macro itself has written it.
    33  #define SHUFFLE(b0, b1, c0, c1, d0, d1, sav, mix) \
    34  	MOVO       c0, sav;  \
    35  	MOVO       c1, c0;   \
    36  	MOVO       sav, c1;  \
    37  	MOVO       d0, sav;  \
    38  	PUNPCKLQDQ d0, mix;  \
    39  	PUNPCKHQDQ d1, d0;   \
    40  	PUNPCKHQDQ mix, d0;  \
    41  	PUNPCKLQDQ d1, mix;  \
    42  	MOVO       sav, d1;  \
    43  	PUNPCKHQDQ mix, d1;  \
    44  	MOVO       b0, sav;  \
    45  	PUNPCKLQDQ b1, mix;  \
    46  	PUNPCKHQDQ mix, b0;  \
    47  	PUNPCKLQDQ sav, mix; \
    48  	PUNPCKHQDQ mix, b1
    49  
    // SHUFFLE_INV undoes SHUFFLE. Each row is a register pair holding four
    // 64-bit words (first register = words 0,1; low quadword = lower index):
    //   (b0, b1)  rotated right by one word   [w0 w1 w2 w3] -> [w3 w0 w1 w2]
    //   (c0, c1)  registers swapped           [w0 w1 w2 w3] -> [w2 w3 w0 w1]
    //   (d0, d1)  rotated left by one word    [w0 w1 w2 w3] -> [w1 w2 w3 w0]
    // sav and mix are scratch: both are clobbered, and only the high quadword
    // of mix is ever consumed, always after the macro itself has written it.
    50  #define SHUFFLE_INV(b0, b1, c0, c1, d0, d1, sav, mix) \
    51  	MOVO       c0, sav;  \
    52  	MOVO       c1, c0;   \
    53  	MOVO       sav, c1;  \
    54  	MOVO       b0, sav;  \
    55  	PUNPCKLQDQ b0, mix;  \
    56  	PUNPCKHQDQ b1, b0;   \
    57  	PUNPCKHQDQ mix, b0;  \
    58  	PUNPCKLQDQ b1, mix;  \
    59  	MOVO       sav, b1;  \
    60  	PUNPCKHQDQ mix, b1;  \
    61  	MOVO       d0, sav;  \
    62  	PUNPCKLQDQ d1, mix;  \
    63  	PUNPCKHQDQ mix, d0;  \
    64  	PUNPCKLQDQ sav, mix; \
    65  	PUNPCKHQDQ mix, d1
    66  
    // HALF_ROUND applies the BLAKE2b mixing function to four (a, b, c, d) word
    // groups at once. Every row is a register pair: a0/b0/c0/d0 carry groups
    // 0 and 1, a1/b1/c1/d1 carry groups 2 and 3.
    //   mx0, mx1  message words added in the first half of the mix
    //   my0, my1  message words added in the second half
    //   rot24     PSHUFB mask rotating each 64-bit lane right by 24
    //   rot16     PSHUFB mask rotating each 64-bit lane right by 16
    //   tmp       scratch register, clobbered
    // PSHUFD $0xB1 swaps the 32-bit halves of each lane (rotate by 32); the
    // closing add/shift/xor sequence rotates each lane of b left by one bit.
    // tmp may be the same register as my1: my1 is read for the last time
    // before tmp is first written, so the statement order must be kept.
    67  #define HALF_ROUND(a0, a1, b0, b1, c0, c1, d0, d1, mx0, mx1, my0, my1, tmp, rot24, rot16) \
    68  	PADDQ  mx0, a0;       \
    69  	PADDQ  mx1, a1;       \
    70  	PADDQ  b0, a0;        \
    71  	PADDQ  b1, a1;        \
    72  	PXOR   a0, d0;        \
    73  	PXOR   a1, d1;        \
    74  	PSHUFD $0xB1, d0, d0; \
    75  	PSHUFD $0xB1, d1, d1; \
    76  	PADDQ  d0, c0;        \
    77  	PADDQ  d1, c1;        \
    78  	PXOR   c0, b0;        \
    79  	PXOR   c1, b1;        \
    80  	PSHUFB rot24, b0;     \
    81  	PSHUFB rot24, b1;     \
    82  	PADDQ  my0, a0;       \
    83  	PADDQ  my1, a1;       \
    84  	PADDQ  b0, a0;        \
    85  	PADDQ  b1, a1;        \
    86  	PXOR   a0, d0;        \
    87  	PXOR   a1, d1;        \
    88  	PSHUFB rot16, d0;     \
    89  	PSHUFB rot16, d1;     \
    90  	PADDQ  d0, c0;        \
    91  	PADDQ  d1, c1;        \
    92  	PXOR   c0, b0;        \
    93  	PXOR   c1, b1;        \
    94  	MOVOU  b0, tmp;       \
    95  	PADDQ  b0, tmp;       \
    96  	PSRLQ  $63, b0;       \
    97  	PXOR   tmp, b0;       \
    98  	MOVOU  b1, tmp;       \
    99  	PADDQ  b1, tmp;       \
   100  	PSRLQ  $63, b1;       \
   101  	PXOR   tmp, b1
   102  
   // LOAD_MSG gathers eight 64-bit words from the block addressed by SI into
   // four registers. k0..k7 are word indices (scaled by 8 to byte offsets):
   //   q0 = (word k0, word k1)    q1 = (word k2, word k3)
   //   q2 = (word k4, word k5)    q3 = (word k6, word k7)
   // with the first word of each pair in the low quadword. SI is only read.
   103  #define LOAD_MSG(q0, q1, q2, q3, k0, k1, k2, k3, k4, k5, k6, k7) \
   104  	MOVQ   k0*8(SI), q0;     \
   105  	PINSRQ $1, k1*8(SI), q0; \
   106  	MOVQ   k2*8(SI), q1;     \
   107  	PINSRQ $1, k3*8(SI), q1; \
   108  	MOVQ   k4*8(SI), q2;     \
   109  	PINSRQ $1, k5*8(SI), q2; \
   110  	MOVQ   k6*8(SI), q3;     \
   111  	PINSRQ $1, k7*8(SI), q3
   112  
   113  // func fSSE4(h *[8]uint64, m *[16]uint64, c0, c1 uint64, flag uint64, rounds uint64)
   114  //
   115  // fSSE4 runs `rounds` rounds of the BLAKE2b compression function on the state
   116  // h with message block m, counter words c0/c1 and finalization flag, then
   117  // folds the result back into h in place.
   118  //
   119  // Register use:
   120  //   AX       h            SI  m (read by LOAD_MSG)      BX  rounds left
   121  //   R10      16-byte aligned scratch slot inside this function's frame
   122  //   X0..X3   v0..v7   (start as h[0..7])
   123  //   X4..X7   v8..v15  (start as iv0, iv1, iv2^(c0,c1), iv3^(flag,0))
   124  //   X8..X11  message words of the current half round, then temporaries
   125  //   X12,X15  h[0..1] and h[2..3], kept for the final feed-forward
   126  //   X13,X14  PSHUFB masks c40 and c48
   127  //
   128  // The hardware SP and BP are left untouched; the aligned scratch slot is
   129  // reached through R10 so the frame looks the same at every instruction.
   130  TEXT ·fSSE4(SB), NOSPLIT, $24-48 // frame size = 8 + 16 byte alignment
   131  	MOVQ h+0(FP), AX
   132  	MOVQ m+8(FP), SI
   133  	MOVQ c0+16(FP), R8
   134  	MOVQ c1+24(FP), R9
   135  	MOVQ flag+32(FP), CX
   136  	MOVQ rounds+40(FP), BX
   137  
   138  	// R10 = SP rounded up to a multiple of 16 (MOVO needs an aligned address).
   139  	MOVQ SP, R10
   140  	ADDQ $15, R10
   141  	ANDQ $~15, R10
   142  
   143  	MOVOU ·iv3<>(SB), X0
   144  	MOVO  X0, 0(R10)
   145  	XORQ  CX, 0(R10)    // 0(R10) = ·iv3 ^ (CX || 0)
   146  
   147  	MOVOU ·c40<>(SB), X13
   148  	MOVOU ·c48<>(SB), X14
   149  
   150  	MOVOU 0(AX), X12
   151  	MOVOU 16(AX), X15
   152  
   153  	MOVQ R8, X8
   154  	PINSRQ $1, R9, X8 // X8 = (c0, c1)
   155  
   156  	MOVO X12, X0
   157  	MOVO X15, X1
   158  	MOVOU 32(AX), X2
   159  	MOVOU 48(AX), X3
   160  	MOVOU ·iv0<>(SB), X4
   161  	MOVOU ·iv1<>(SB), X5
   162  	MOVOU ·iv2<>(SB), X6
   163  
   164  	PXOR X8, X6     // X6 = iv2 ^ (c0, c1)
   165  	MOVO 0(R10), X7 // X7 = iv3 ^ (flag, 0)
   166  
   167  	// Ten unrolled rounds, one per message-word schedule. BX is decremented
   168  	// before every round and the borrow (BX was 0) leaves the loop, so the
   169  	// round count is arbitrary; past ten rounds the schedules repeat.
   170  loop:
   171  	SUBQ $1, BX; JCS done
   172  	LOAD_MSG(X8, X9, X10, X11, 0, 2, 4, 6, 1, 3, 5, 7)
   173  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
   174  	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
   175  	LOAD_MSG(X8, X9, X10, X11, 8, 10, 12, 14, 9, 11, 13, 15)
   176  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
   177  	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
   178  
   179  	SUBQ $1, BX; JCS done
   180  	LOAD_MSG(X8, X9, X10, X11, 14, 4, 9, 13, 10, 8, 15, 6)
   181  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
   182  	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
   183  	LOAD_MSG(X8, X9, X10, X11, 1, 0, 11, 5, 12, 2, 7, 3)
   184  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
   185  	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
   186  
   187  	SUBQ $1, BX; JCS done
   188  	LOAD_MSG(X8, X9, X10, X11, 11, 12, 5, 15, 8, 0, 2, 13)
   189  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
   190  	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
   191  	LOAD_MSG(X8, X9, X10, X11, 10, 3, 7, 9, 14, 6, 1, 4)
   192  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
   193  	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
   194  
   195  	SUBQ $1, BX; JCS done
   196  	LOAD_MSG(X8, X9, X10, X11, 7, 3, 13, 11, 9, 1, 12, 14)
   197  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
   198  	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
   199  	LOAD_MSG(X8, X9, X10, X11, 2, 5, 4, 15, 6, 10, 0, 8)
   200  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
   201  	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
   202  
   203  	SUBQ $1, BX; JCS done
   204  	LOAD_MSG(X8, X9, X10, X11, 9, 5, 2, 10, 0, 7, 4, 15)
   205  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
   206  	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
   207  	LOAD_MSG(X8, X9, X10, X11, 14, 11, 6, 3, 1, 12, 8, 13)
   208  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
   209  	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
   210  
   211  	SUBQ $1, BX; JCS done
   212  	LOAD_MSG(X8, X9, X10, X11, 2, 6, 0, 8, 12, 10, 11, 3)
   213  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
   214  	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
   215  	LOAD_MSG(X8, X9, X10, X11, 4, 7, 15, 1, 13, 5, 14, 9)
   216  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
   217  	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
   218  
   219  	SUBQ $1, BX; JCS done
   220  	LOAD_MSG(X8, X9, X10, X11, 12, 1, 14, 4, 5, 15, 13, 10)
   221  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
   222  	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
   223  	LOAD_MSG(X8, X9, X10, X11, 0, 6, 9, 8, 7, 3, 2, 11)
   224  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
   225  	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
   226  
   227  	SUBQ $1, BX; JCS done
   228  	LOAD_MSG(X8, X9, X10, X11, 13, 7, 12, 3, 11, 14, 1, 9)
   229  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
   230  	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
   231  	LOAD_MSG(X8, X9, X10, X11, 5, 15, 8, 2, 0, 4, 6, 10)
   232  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
   233  	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
   234  
   235  	SUBQ $1, BX; JCS done
   236  	LOAD_MSG(X8, X9, X10, X11, 6, 14, 11, 0, 15, 9, 3, 8)
   237  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
   238  	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
   239  	LOAD_MSG(X8, X9, X10, X11, 12, 13, 1, 10, 2, 7, 4, 5)
   240  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
   241  	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
   242  
   243  	SUBQ $1, BX; JCS done
   244  	LOAD_MSG(X8, X9, X10, X11, 10, 8, 7, 1, 2, 4, 6, 5)
   245  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
   246  	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
   247  	LOAD_MSG(X8, X9, X10, X11, 15, 9, 3, 13, 11, 14, 12, 0)
   248  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
   249  	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
   250  
   251  	JMP loop
   252  
   253  done:
   254  	// Feed-forward: h[i] ^= v[i] ^ v[i+8] for i = 0..7.
   255  	MOVOU 32(AX), X10
   256  	MOVOU 48(AX), X11
   257  	PXOR  X0, X12
   258  	PXOR  X1, X15
   259  	PXOR  X2, X10
   260  	PXOR  X3, X11
   261  	PXOR  X4, X12
   262  	PXOR  X5, X15
   263  	PXOR  X6, X10
   264  	PXOR  X7, X11
   265  	MOVOU X10, 32(AX)
   266  	MOVOU X11, 48(AX)
   267  
   268  	MOVOU X12, 0(AX)
   269  	MOVOU X15, 16(AX)
   270  
   271  	RET