storj.io/minio@v0.0.0-20230509071714-0cbc90f649b1/pkg/argon2/blamka_amd64.s (about)

     1  // Copyright 2017 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build amd64,!gccgo,!appengine
     6  
     7  #include "textflag.h"
     8  
     9  DATA ·c40<>+0x00(SB)/8, $0x0201000706050403 // PSHUFB mask: rotate each 64-bit lane right by 24 bits
    10  DATA ·c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
    11  GLOBL ·c40<>(SB), (NOPTR+RODATA), $16
    12  
    13  DATA ·c48<>+0x00(SB)/8, $0x0100070605040302 // PSHUFB mask: rotate each 64-bit lane right by 16 bits
    14  DATA ·c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
    15  GLOBL ·c48<>(SB), (NOPTR+RODATA), $16
    16  
    // SHUFFLE diagonalizes the state between the two half rounds of a BlaMka
    // round (the packed-SSE analogue of BLAKE2b's DIAGONALIZE): v4 and v5 are
    // swapped, and the 64-bit lanes of the (v2,v3) and (v6,v7) register pairs
    // are rotated across the pair with PUNPCK{L,H}QDQ so the next HALF_ROUND
    // mixes diagonals instead of columns. t1 and t2 are scratch; t2's incoming
    // value never reaches the result (only its freshly written high lane is
    // ever consumed). Comments cannot appear inside the macro because of the
    // backslash continuations.
    17  #define SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2) \
    18  	MOVO       v4, t1; \
    19  	MOVO       v5, v4; \
    20  	MOVO       t1, v5; \
    21  	MOVO       v6, t1; \
    22  	PUNPCKLQDQ v6, t2; \
    23  	PUNPCKHQDQ v7, v6; \
    24  	PUNPCKHQDQ t2, v6; \
    25  	PUNPCKLQDQ v7, t2; \
    26  	MOVO       t1, v7; \
    27  	MOVO       v2, t1; \
    28  	PUNPCKHQDQ t2, v7; \
    29  	PUNPCKLQDQ v3, t2; \
    30  	PUNPCKHQDQ t2, v2; \
    31  	PUNPCKLQDQ t1, t2; \
    32  	PUNPCKHQDQ t2, v3
    33  
    // SHUFFLE_INV is the exact inverse of SHUFFLE (BLAKE2b UNDIAGONALIZE):
    // it swaps v4/v5 back and undoes the cross-pair lane rotation of (v2,v3)
    // and (v6,v7), restoring column order before the state is stored.
    // t1 and t2 are scratch; t2 need not hold a defined value on entry.
    34  #define SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2) \
    35  	MOVO       v4, t1; \
    36  	MOVO       v5, v4; \
    37  	MOVO       t1, v5; \
    38  	MOVO       v2, t1; \
    39  	PUNPCKLQDQ v2, t2; \
    40  	PUNPCKHQDQ v3, v2; \
    41  	PUNPCKHQDQ t2, v2; \
    42  	PUNPCKLQDQ v3, t2; \
    43  	MOVO       t1, v3; \
    44  	MOVO       v6, t1; \
    45  	PUNPCKHQDQ t2, v3; \
    46  	PUNPCKLQDQ v7, t2; \
    47  	PUNPCKHQDQ t2, v6; \
    48  	PUNPCKLQDQ t1, t2; \
    49  	PUNPCKHQDQ t2, v7
    50  
    // HALF_ROUND runs the BLAKE2b-style G function over the columns of two
    // side-by-side states packed two 64-bit lanes per XMM register
    // (v0..v7 = rows). Each additive mixing step uses the Argon2 "fBlaMka"
    // multiply-add instead of plain addition:
    //   a = a + b + 2 * lo32(a) * lo32(b)
    // computed as t0 = lo32(a)*lo32(b) per lane (PMULULQ is the Go-asm name
    // for PMULUDQ), then a += b; a += t0; a += t0.
    // The G-function rotations are implemented as:
    //   rotr 32: PSHUFD $0xB1  (swap the 32-bit halves of each 64-bit lane)
    //   rotr 24: PSHUFB c40
    //   rotr 16: PSHUFB c48
    //   rotr 63: t0 = 2x (PADDQ x,x), x >>= 63, x ^= t0
    // t0 is a scratch register; c40/c48 hold the byte-shuffle masks.
    51  #define HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, t0, c40, c48) \
    52  	MOVO    v0, t0;        \
    53  	PMULULQ v2, t0;        \
    54  	PADDQ   v2, v0;        \
    55  	PADDQ   t0, v0;        \
    56  	PADDQ   t0, v0;        \
    57  	PXOR    v0, v6;        \
    58  	PSHUFD  $0xB1, v6, v6; \
    59  	MOVO    v4, t0;        \
    60  	PMULULQ v6, t0;        \
    61  	PADDQ   v6, v4;        \
    62  	PADDQ   t0, v4;        \
    63  	PADDQ   t0, v4;        \
    64  	PXOR    v4, v2;        \
    65  	PSHUFB  c40, v2;       \
    66  	MOVO    v0, t0;        \
    67  	PMULULQ v2, t0;        \
    68  	PADDQ   v2, v0;        \
    69  	PADDQ   t0, v0;        \
    70  	PADDQ   t0, v0;        \
    71  	PXOR    v0, v6;        \
    72  	PSHUFB  c48, v6;       \
    73  	MOVO    v4, t0;        \
    74  	PMULULQ v6, t0;        \
    75  	PADDQ   v6, v4;        \
    76  	PADDQ   t0, v4;        \
    77  	PADDQ   t0, v4;        \
    78  	PXOR    v4, v2;        \
    79  	MOVO    v2, t0;        \
    80  	PADDQ   v2, t0;        \
    81  	PSRLQ   $63, v2;       \
    82  	PXOR    t0, v2;        \
    83  	MOVO    v1, t0;        \
    84  	PMULULQ v3, t0;        \
    85  	PADDQ   v3, v1;        \
    86  	PADDQ   t0, v1;        \
    87  	PADDQ   t0, v1;        \
    88  	PXOR    v1, v7;        \
    89  	PSHUFD  $0xB1, v7, v7; \
    90  	MOVO    v5, t0;        \
    91  	PMULULQ v7, t0;        \
    92  	PADDQ   v7, v5;        \
    93  	PADDQ   t0, v5;        \
    94  	PADDQ   t0, v5;        \
    95  	PXOR    v5, v3;        \
    96  	PSHUFB  c40, v3;       \
    97  	MOVO    v1, t0;        \
    98  	PMULULQ v3, t0;        \
    99  	PADDQ   v3, v1;        \
   100  	PADDQ   t0, v1;        \
   101  	PADDQ   t0, v1;        \
   102  	PXOR    v1, v7;        \
   103  	PSHUFB  c48, v7;       \
   104  	MOVO    v5, t0;        \
   105  	PMULULQ v7, t0;        \
   106  	PADDQ   v7, v5;        \
   107  	PADDQ   t0, v5;        \
   108  	PADDQ   t0, v5;        \
   109  	PXOR    v5, v3;        \
   110  	MOVO    v3, t0;        \
   111  	PADDQ   v3, t0;        \
   112  	PSRLQ   $63, v3;       \
   113  	PXOR    t0, v3
   114  
   // LOAD_MSG_0 loads the 16 consecutive 64-bit words starting at byte
   // offset 8*off of block into X0..X7, two words per register
   // (one 128-byte row of the Argon2 block).
   115  #define LOAD_MSG_0(block, off) \
   116  	MOVOU 8*(off+0)(block), X0;  \
   117  	MOVOU 8*(off+2)(block), X1;  \
   118  	MOVOU 8*(off+4)(block), X2;  \
   119  	MOVOU 8*(off+6)(block), X3;  \
   120  	MOVOU 8*(off+8)(block), X4;  \
   121  	MOVOU 8*(off+10)(block), X5; \
   122  	MOVOU 8*(off+12)(block), X6; \
   123  	MOVOU 8*(off+14)(block), X7
   124  
   // STORE_MSG_0 writes X0..X7 back to the 16 consecutive 64-bit words
   // starting at byte offset 8*off of block (inverse of LOAD_MSG_0).
   125  #define STORE_MSG_0(block, off) \
   126  	MOVOU X0, 8*(off+0)(block);  \
   127  	MOVOU X1, 8*(off+2)(block);  \
   128  	MOVOU X2, 8*(off+4)(block);  \
   129  	MOVOU X3, 8*(off+6)(block);  \
   130  	MOVOU X4, 8*(off+8)(block);  \
   131  	MOVOU X5, 8*(off+10)(block); \
   132  	MOVOU X6, 8*(off+12)(block); \
   133  	MOVOU X7, 8*(off+14)(block)
   134  
   // LOAD_MSG_1 loads a column slice of the block: eight 16-byte pairs of
   // 64-bit words starting at byte offset 8*off, with a stride of 16 words
   // (128 bytes) between successive registers — i.e. the same two-word
   // column taken from each of the eight 128-byte rows.
   135  #define LOAD_MSG_1(block, off) \
   136  	MOVOU 8*off+0*8(block), X0;  \
   137  	MOVOU 8*off+16*8(block), X1; \
   138  	MOVOU 8*off+32*8(block), X2; \
   139  	MOVOU 8*off+48*8(block), X3; \
   140  	MOVOU 8*off+64*8(block), X4; \
   141  	MOVOU 8*off+80*8(block), X5; \
   142  	MOVOU 8*off+96*8(block), X6; \
   143  	MOVOU 8*off+112*8(block), X7
   144  
   // STORE_MSG_1 writes X0..X7 back to the strided column slice addressed
   // by LOAD_MSG_1 (16-word stride between registers).
   145  #define STORE_MSG_1(block, off) \
   146  	MOVOU X0, 8*off+0*8(block);  \
   147  	MOVOU X1, 8*off+16*8(block); \
   148  	MOVOU X2, 8*off+32*8(block); \
   149  	MOVOU X3, 8*off+48*8(block); \
   150  	MOVOU X4, 8*off+64*8(block); \
   151  	MOVOU X5, 8*off+80*8(block); \
   152  	MOVOU X6, 8*off+96*8(block); \
   153  	MOVOU X7, 8*off+112*8(block)
   154  
   // BLAMKA_ROUND_0 applies one full BlaMka round — column half round,
   // diagonalize, diagonal half round, undiagonalize — to the 16
   // consecutive words at offset 8*off (one 128-byte row), in place.
   // t0/t1 are scratch XMM registers; c40/c48 are the rotation masks.
   155  #define BLAMKA_ROUND_0(block, off, t0, t1, c40, c48) \
   156  	LOAD_MSG_0(block, off);                                   \
   157  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
   158  	SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1);                  \
   159  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
   160  	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1);              \
   161  	STORE_MSG_0(block, off)
   162  
   // BLAMKA_ROUND_1 is the same full BlaMka round as BLAMKA_ROUND_0, but
   // operating on a strided column slice of the block (see LOAD_MSG_1),
   // in place.
   163  #define BLAMKA_ROUND_1(block, off, t0, t1, c40, c48) \
   164  	LOAD_MSG_1(block, off);                                   \
   165  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
   166  	SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1);                  \
   167  	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
   168  	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1);              \
   169  	STORE_MSG_1(block, off)
   170  
   171  // func blamkaSSE4(b *block)
   //
   // blamkaSSE4 applies the Argon2 BlaMka permutation P to the 1024-byte
   // block pointed to by b: eight row-wise rounds over the 128-byte rows,
   // then eight column-wise rounds over 16-word-stride slices.
   // Flag 4 is NOSPLIT (textflag.h); no locals, one 8-byte pointer arg.
   // X8/X9 are scratch; X10/X11 hold the rotr24/rotr16 shuffle masks.
   172  TEXT ·blamkaSSE4(SB), 4, $0-8
   173  	MOVQ b+0(FP), AX // AX = &b[0]
   174  
   175  	MOVOU ·c40<>(SB), X10 // rotate-right-24 byte-shuffle mask
   176  	MOVOU ·c48<>(SB), X11 // rotate-right-16 byte-shuffle mask
   177  
   178  	BLAMKA_ROUND_0(AX, 0, X8, X9, X10, X11) // row i = words 16*i .. 16*i+15
   179  	BLAMKA_ROUND_0(AX, 16, X8, X9, X10, X11)
   180  	BLAMKA_ROUND_0(AX, 32, X8, X9, X10, X11)
   181  	BLAMKA_ROUND_0(AX, 48, X8, X9, X10, X11)
   182  	BLAMKA_ROUND_0(AX, 64, X8, X9, X10, X11)
   183  	BLAMKA_ROUND_0(AX, 80, X8, X9, X10, X11)
   184  	BLAMKA_ROUND_0(AX, 96, X8, X9, X10, X11)
   185  	BLAMKA_ROUND_0(AX, 112, X8, X9, X10, X11)
   186  
   187  	BLAMKA_ROUND_1(AX, 0, X8, X9, X10, X11) // column pair starting at word `off`
   188  	BLAMKA_ROUND_1(AX, 2, X8, X9, X10, X11)
   189  	BLAMKA_ROUND_1(AX, 4, X8, X9, X10, X11)
   190  	BLAMKA_ROUND_1(AX, 6, X8, X9, X10, X11)
   191  	BLAMKA_ROUND_1(AX, 8, X8, X9, X10, X11)
   192  	BLAMKA_ROUND_1(AX, 10, X8, X9, X10, X11)
   193  	BLAMKA_ROUND_1(AX, 12, X8, X9, X10, X11)
   194  	BLAMKA_ROUND_1(AX, 14, X8, X9, X10, X11)
   195  	RET
   196  
   197  // func mixBlocksSSE2(out, a, b, c *block)
   //
   // mixBlocksSSE2 computes out = a XOR b XOR c over a full 1024-byte
   // block, 16 bytes (two 64-bit words) per iteration. NOSPLIT, no locals,
   // four 8-byte pointer args.
   198  TEXT ·mixBlocksSSE2(SB), 4, $0-32
   199  	MOVQ out+0(FP), DX
   200  	MOVQ a+8(FP), AX
   201  	MOVQ b+16(FP), BX
   202  	MOVQ c+24(FP), CX
   203  	MOVQ $128, SI // SI = 64-bit words remaining (128 words = 1024 bytes)
   204  
   205  loop:
   206  	MOVOU 0(AX), X0
   207  	MOVOU 0(BX), X1
   208  	MOVOU 0(CX), X2
   209  	PXOR  X1, X0 // X0 = a ^ b
   210  	PXOR  X2, X0 // X0 = a ^ b ^ c
   211  	MOVOU X0, 0(DX)
   212  	ADDQ  $16, AX
   213  	ADDQ  $16, BX
   214  	ADDQ  $16, CX
   215  	ADDQ  $16, DX
   216  	SUBQ  $2, SI // consumed two words this iteration
   217  	JA    loop   // continue while SI > 0 (unsigned)
   218  	RET
   219  
   220  // func xorBlocksSSE2(out, a, b, c *block)
   //
   // xorBlocksSSE2 computes out ^= a XOR b XOR c over a full 1024-byte
   // block, 16 bytes (two 64-bit words) per iteration — identical to
   // mixBlocksSSE2 except the previous contents of out are folded in.
   // NOSPLIT, no locals, four 8-byte pointer args.
   221  TEXT ·xorBlocksSSE2(SB), 4, $0-32
   222  	MOVQ out+0(FP), DX
   223  	MOVQ a+8(FP), AX
   224  	MOVQ b+16(FP), BX
   225  	MOVQ c+24(FP), CX
   226  	MOVQ $128, SI // SI = 64-bit words remaining (128 words = 1024 bytes)
   227  
   228  loop:
   229  	MOVOU 0(AX), X0
   230  	MOVOU 0(BX), X1
   231  	MOVOU 0(CX), X2
   232  	MOVOU 0(DX), X3 // current contents of out
   233  	PXOR  X1, X0    // X0 = a ^ b
   234  	PXOR  X2, X0    // X0 = a ^ b ^ c
   235  	PXOR  X3, X0    // X0 = out ^ a ^ b ^ c
   236  	MOVOU X0, 0(DX)
   237  	ADDQ  $16, AX
   238  	ADDQ  $16, BX
   239  	ADDQ  $16, CX
   240  	ADDQ  $16, DX
   241  	SUBQ  $2, SI // consumed two words this iteration
   242  	JA    loop   // continue while SI > 0 (unsigned)
   243  	RET