git.sr.ht/~pingoo/stdx@v0.0.0-20240218134121-094174641f6e/crypto/chacha/macro.s

// Copyright (c) 2018 Andreas Auernhammer. All rights reserved.
// Use of this source code is governed by a license that can be
// found in the LICENSE file.

// +build 386,!gccgo,!appengine,!nacl amd64,!gccgo,!appengine,!nacl

// ROTL_SSE rotates all 4 32 bit values of the XMM register v
// left by n bits using SSE2 instructions (0 <= n <= 32).
// The XMM register t is used as a temp. register.
#define ROTL_SSE(n, t, v) \
	MOVO  v, t;       \
	PSLLL $n, t;      \
	PSRLL $(32-n), v; \
	PXOR  t, v
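
// For reference, ROTL_SSE and ROTL_AVX below both compute a plain left
// rotation of each 32 bit lane. A minimal Go sketch of the per-lane
// operation (rotl32 is an illustrative name, not part of this package):
//
//	func rotl32(v uint32, n uint) uint32 {
//		return v<<n | v>>(32-n)
//	}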

// ROTL_AVX rotates all 4/8 32 bit values of the AVX/AVX2 register v
// left by n bits using AVX/AVX2 instructions (0 <= n <= 32).
// The AVX/AVX2 register t is used as a temp. register.
#define ROTL_AVX(n, t, v) \
	VPSLLD $n, v, t;      \
	VPSRLD $(32-n), v, v; \
	VPXOR  v, t, v
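
// Note that the VEX three-operand forms above write the shifted copy of v
// directly into t, so ROTL_AVX needs no initial MOVO, unlike ROTL_SSE.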

// CHACHA_QROUND_SSE2 performs a ChaCha quarter-round using the
// 4 XMM registers v0, v1, v2 and v3. It uses only ROTL_SSE for
// rotations. The XMM register t is used as a temp. register.
#define CHACHA_QROUND_SSE2(v0, v1, v2, v3, t) \
	PADDL v1, v0;        \
	PXOR  v0, v3;        \
	ROTL_SSE(16, t, v3); \
	PADDL v3, v2;        \
	PXOR  v2, v1;        \
	ROTL_SSE(12, t, v1); \
	PADDL v1, v0;        \
	PXOR  v0, v3;        \
	ROTL_SSE(8, t, v3);  \
	PADDL v3, v2;        \
	PXOR  v2, v1;        \
	ROTL_SSE(7, t, v1)
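
// For reference, every quarter-round macro in this file computes the standard
// ChaCha quarter-round (RFC 8439, section 2.1) on each lane, with v0 - v3
// playing the roles of a, b, c and d. A minimal Go sketch with illustrative
// names:
//
//	func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
//		a += b; d ^= a; d = d<<16 | d>>16
//		c += d; b ^= c; b = b<<12 | b>>20
//		a += b; d ^= a; d = d<<8 | d>>24
//		c += d; b ^= c; b = b<<7 | b>>25
//		return a, b, c, d
//	}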

// CHACHA_QROUND_SSSE3 performs a ChaCha quarter-round using the
// 4 XMM registers v0, v1, v2 and v3. It uses PSHUFB for 8/16 bit
// rotations. The XMM register t is used as a temp. register.
//
// r16 holds the PSHUFB constant for a 16 bit left rotate.
// r8 holds the PSHUFB constant for an 8 bit left rotate.
#define CHACHA_QROUND_SSSE3(v0, v1, v2, v3, t, r16, r8) \
	PADDL  v1, v0;       \
	PXOR   v0, v3;       \
	PSHUFB r16, v3;      \
	PADDL  v3, v2;       \
	PXOR   v2, v1;       \
	ROTL_SSE(12, t, v1); \
	PADDL  v1, v0;       \
	PXOR   v0, v3;       \
	PSHUFB r8, v3;       \
	PADDL  v3, v2;       \
	PXOR   v2, v1;       \
	ROTL_SSE(7, t, v1)
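
// A rotation by 8 or 16 bits moves whole bytes, so a single PSHUFB with a
// fixed byte permutation can replace the shift/xor sequence of ROTL_SSE.
// The masks are defined by the code that uses this macro, not here;
// presumably they are per-lane permutations along the lines of
// {2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13} for r16 and
// {3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14} for r8 (little-endian lanes).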

// CHACHA_QROUND_AVX performs a ChaCha quarter-round using the
// 4 AVX/AVX2 registers v0, v1, v2 and v3. It uses VPSHUFB for 8/16 bit
// rotations. The AVX/AVX2 register t is used as a temp. register.
//
// r16 holds the VPSHUFB constant for a 16 bit left rotate.
// r8 holds the VPSHUFB constant for an 8 bit left rotate.
#define CHACHA_QROUND_AVX(v0, v1, v2, v3, t, r16, r8) \
	VPADDD  v0, v1, v0;  \
	VPXOR   v3, v0, v3;  \
	VPSHUFB r16, v3, v3; \
	VPADDD  v2, v3, v2;  \
	VPXOR   v1, v2, v1;  \
	ROTL_AVX(12, t, v1); \
	VPADDD  v0, v1, v0;  \
	VPXOR   v3, v0, v3;  \
	VPSHUFB r8, v3, v3;  \
	VPADDD  v2, v3, v2;  \
	VPXOR   v1, v2, v1;  \
	ROTL_AVX(7, t, v1)

// CHACHA_SHUFFLE_SSE performs a ChaCha shuffle using the
// 3 XMM registers v1, v2 and v3. The inverse shuffle is
// performed by switching v1 and v3: CHACHA_SHUFFLE_SSE(v3, v2, v1).
#define CHACHA_SHUFFLE_SSE(v1, v2, v3) \
	PSHUFL $0x39, v1, v1; \
	PSHUFL $0x4E, v2, v2; \
	PSHUFL $0x93, v3, v3
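
// Assuming the usual layout where v1, v2 and v3 hold rows 1-3 of the 4x4
// ChaCha state (one row per register), these shuffles rotate the lanes of the
// three rows left by one, two and three positions respectively, moving the
// state from column order into diagonal order; the inverse form listed above
// rotates them back.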

// CHACHA_SHUFFLE_AVX performs a ChaCha shuffle using the
// 3 AVX/AVX2 registers v1, v2 and v3. The inverse shuffle is
// performed by switching v1 and v3: CHACHA_SHUFFLE_AVX(v3, v2, v1).
#define CHACHA_SHUFFLE_AVX(v1, v2, v3) \
	VPSHUFD $0x39, v1, v1; \
	VPSHUFD $0x4E, v2, v2; \
	VPSHUFD $0x93, v3, v3

// XOR_SSE extracts 4x16 byte vectors from src at
// off, xors all vectors with the corresponding XMM
// register (v0 - v3) and writes the result to dst
// at off.
// The XMM register t is used as a temp. register.
#define XOR_SSE(dst, src, off, v0, v1, v2, v3, t) \
	MOVOU 0+off(src), t;  \
	PXOR  v0, t;          \
	MOVOU t, 0+off(dst);  \
	MOVOU 16+off(src), t; \
	PXOR  v1, t;          \
	MOVOU t, 16+off(dst); \
	MOVOU 32+off(src), t; \
	PXOR  v2, t;          \
	MOVOU t, 32+off(dst); \
	MOVOU 48+off(src), t; \
	PXOR  v3, t;          \
	MOVOU t, 48+off(dst)
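
// For reference, XOR_SSE and XOR_AVX both implement the following byte-wise
// operation on a 64 byte chunk (a Go sketch; ks stands for the 64 bytes of
// keystream held in v0 - v3 and is not a name used by this package):
//
//	for i := 0; i < 64; i++ {
//		dst[off+i] = src[off+i] ^ ks[i]
//	}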

// XOR_AVX extracts 4x16 byte vectors from src at
// off, xors all vectors with the corresponding AVX
// register (v0 - v3) and writes the result to dst
// at off.
// The XMM register t is used as a temp. register.
#define XOR_AVX(dst, src, off, v0, v1, v2, v3, t) \
	VPXOR   0+off(src), v0, t;  \
	VMOVDQU t, 0+off(dst);      \
	VPXOR   16+off(src), v1, t; \
	VMOVDQU t, 16+off(dst);     \
	VPXOR   32+off(src), v2, t; \
	VMOVDQU t, 32+off(dst);     \
	VPXOR   48+off(src), v3, t; \
	VMOVDQU t, 48+off(dst)

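// XOR_AVX2 extracts 4x32 byte vectors from src at off, xors them with
// keystream material taken from the AVX2 registers v0 - v3 and writes the
// result to dst at off. VPERM2I128 recombines the 128 bit lanes (low lanes
// first, then high lanes), which matches callers that keep two ChaCha blocks
// interleaved across the lanes of v0 - v3. The AVX2 registers t0 and t1 are
// used as temp. registers.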
#define XOR_AVX2(dst, src, off, v0, v1, v2, v3, t0, t1) \
	VMOVDQU    (0+off)(src), t0;  \
	VPERM2I128 $32, v1, v0, t1;   \
	VPXOR      t0, t1, t0;        \
	VMOVDQU    t0, (0+off)(dst);  \
	VMOVDQU    (32+off)(src), t0; \
	VPERM2I128 $32, v3, v2, t1;   \
	VPXOR      t0, t1, t0;        \
	VMOVDQU    t0, (32+off)(dst); \
	VMOVDQU    (64+off)(src), t0; \
	VPERM2I128 $49, v1, v0, t1;   \
	VPXOR      t0, t1, t0;        \
	VMOVDQU    t0, (64+off)(dst); \
	VMOVDQU    (96+off)(src), t0; \
	VPERM2I128 $49, v3, v2, t1;   \
	VPXOR      t0, t1, t0;        \
	VMOVDQU    t0, (96+off)(dst)

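// XOR_UPPER_AVX2 is the first half of XOR_AVX2: it xors 2x32 byte vectors
// from src at off with the low 128 bit lanes of v0 - v3 (recombined via
// VPERM2I128) and writes the result to dst at off. The AVX2 registers t0 and
// t1 are used as temp. registers.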
#define XOR_UPPER_AVX2(dst, src, off, v0, v1, v2, v3, t0, t1) \
	VMOVDQU    (0+off)(src), t0;  \
	VPERM2I128 $32, v1, v0, t1;   \
	VPXOR      t0, t1, t0;        \
	VMOVDQU    t0, (0+off)(dst);  \
	VMOVDQU    (32+off)(src), t0; \
	VPERM2I128 $32, v3, v2, t1;   \
	VPXOR      t0, t1, t0;        \
	VMOVDQU    t0, (32+off)(dst)

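// EXTRACT_LOWER writes 2x32 bytes taken from the high 128 bit lanes of
// v0 - v3 (recombined via VPERM2I128) to dst, without xoring them into
// existing data. The AVX2 register t0 is used as a temp. register.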
#define EXTRACT_LOWER(dst, v0, v1, v2, v3, t0) \
	VPERM2I128 $49, v1, v0, t0; \
	VMOVDQU    t0, 0(dst);      \
	VPERM2I128 $49, v3, v2, t0; \
	VMOVDQU    t0, 32(dst)