git.sr.ht/~pingoo/stdx@v0.0.0-20240218134121-094174641f6e/crypto/chacha/chacha_386.s (about)

     1  // Copyright (c) 2016 Andreas Auernhammer. All rights reserved.
     2  // Use of this source code is governed by a license that can be
     3  // found in the LICENSE file.
     4  
     5  // +build 386,!gccgo,!appengine,!nacl
     6  
     7  #include "const.s"
     8  #include "macro.s"
     9  
    10  // FINALIZE xors len bytes from src and block using
    11  // the temp. registers t0 and t1 and writes the result
    12  // to dst.
        //
        // All six arguments are registers. Each loop iteration handles one byte:
        // it loads a byte from 0(src) into t0 and a byte from 0(block) into t1,
        // XORs them, and stores the low byte of t1 to 0(dst).
        //
        // Side effects visible in the expansion:
        //   - t0 and t1 are zeroed first and are clobbered.
        //   - src, block and dst are each incremented once per byte, and len is
        //     decremented once per byte, so none of them keeps its entry value.
        //   - The loop condition is tested at the bottom (DECL len; JG), so the
        //     body always runs at least once; callers must enter with len >= 1.
        //   - The expansion defines the label FINALIZE_LOOP, so using the macro
        //     twice inside one TEXT body would define that label twice.
    13  #define FINALIZE(dst, src, block, len, t0, t1) \
    14  	XORL t0, t0;       \
    15  	XORL t1, t1;       \
    16  	FINALIZE_LOOP:;    \
    17  	MOVB 0(src), t0;   \
    18  	MOVB 0(block), t1; \
    19  	XORL t0, t1;       \
    20  	MOVB t1, 0(dst);   \
    21  	INCL src;          \
    22  	INCL block;        \
    23  	INCL dst;          \
    24  	DECL len;          \
    25  	JG   FINALIZE_LOOP \
    26  
    27  #define Dst DI // DI: loaded with the out pointer
    28  #define Nonce AX // AX: loaded with the nonce pointer
    29  #define Key BX // BX: loaded with the key pointer
    30  #define Rounds DX // DX: counts the remaining rounds
    31  
    32  // func hChaCha20SSE2(out *[32]byte, nonce *[16]byte, key *[32]byte)
        //
        // Builds a four-row state in X0..X3 from the sigma constant, the 32-byte
        // key and the 16-byte nonce, runs 20 rounds over it, and writes X0 to
        // out[0:16] and X3 to out[16:32]. The initial state is not added back
        // before the result is stored.
        //
        // Flags value 4 is NOSPLIT in Go's textflag.h; $0-12 declares no local
        // frame and 12 bytes of arguments (three 4-byte pointers).
        // Clobbers: AX, BX, DX, DI, X0-X3, flags, and X4, which is handed to
        // CHACHA_QROUND_SSE2 (defined outside this file; presumably as a scratch
        // register - confirm in macro.s).
    33  TEXT ·hChaCha20SSE2(SB), 4, $0-12
    34  	MOVL out+0(FP), Dst
    35  	MOVL nonce+4(FP), Nonce
    36  	MOVL key+8(FP), Key
    37  
        	// Row 0 = sigma constant, rows 1-2 = key, row 3 = nonce.
        	// MOVOU is used for every load and store, so no pointer needs to be
        	// 16-byte aligned.
    38  	MOVOU ·sigma<>(SB), X0
    39  	MOVOU 0*16(Key), X1
    40  	MOVOU 1*16(Key), X2
    41  	MOVOU 0*16(Nonce), X3
    42  	MOVL  $20, Rounds
    43  
        	// Each iteration invokes the quarter-round macro twice and subtracts 2
        	// from Rounds, so the loop body executes 10 times. The two SHUFFLE calls
        	// take the rows in opposite order (X1,X2,X3 then X3,X2,X1); presumably
        	// the second undoes the first - confirm in macro.s.
    44  chacha_loop:
    45  	CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4)
    46  	CHACHA_SHUFFLE_SSE(X1, X2, X3)
    47  	CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4)
    48  	CHACHA_SHUFFLE_SSE(X3, X2, X1)
    49  	SUBL $2, Rounds
    50  	JNZ  chacha_loop
    51  
        	// Only the first and last rows are written out.
    52  	MOVOU X0, 0*16(Dst)
    53  	MOVOU X3, 1*16(Dst)
    54  	RET
    55  
    56  // func hChaCha20SSSE3(out *[32]byte, nonce *[16]byte, key *[32]byte)
        //
        // Same data flow as hChaCha20SSE2: the state is built in X0..X3 from the
        // sigma constant, the 32-byte key and the 16-byte nonce, 20 rounds are
        // run, and X0 / X3 are written to out[0:16] / out[16:32]. The difference
        // is that the quarter-round macro is CHACHA_QROUND_SSSE3, which also
        // receives X5 and X6 preloaded with the rol16 and rol8 constants
        // (presumably byte-shuffle masks for the 16- and 8-bit rotations -
        // confirm in const.s / macro.s).
        //
        // Flags value 4 is NOSPLIT in Go's textflag.h; $0-12 declares no local
        // frame and 12 bytes of arguments.
        // Clobbers: AX, BX, DX, DI, X0-X6, flags.
    57  TEXT ·hChaCha20SSSE3(SB), 4, $0-12
    58  	MOVL out+0(FP), Dst
    59  	MOVL nonce+4(FP), Nonce
    60  	MOVL key+8(FP), Key
    61  
        	// Row 0 = sigma constant, rows 1-2 = key, row 3 = nonce.
    62  	MOVOU ·sigma<>(SB), X0
    63  	MOVOU 0*16(Key), X1
    64  	MOVOU 1*16(Key), X2
    65  	MOVOU 0*16(Nonce), X3
    66  	MOVL  $20, Rounds
    67  
        	// Loaded once, outside the loop, and passed to every quarter-round.
    68  	MOVOU ·rol16<>(SB), X5
    69  	MOVOU ·rol8<>(SB), X6
    70  
        	// Two quarter-round macro invocations per iteration; Rounds drops by 2
        	// each time, so the loop body executes 10 times.
    71  chacha_loop:
    72  	CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X4, X5, X6)
    73  	CHACHA_SHUFFLE_SSE(X1, X2, X3)
    74  	CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X4, X5, X6)
    75  	CHACHA_SHUFFLE_SSE(X3, X2, X1)
    76  	SUBL $2, Rounds
    77  	JNZ  chacha_loop
    78  
        	// Only the first and last rows are written out.
    79  	MOVOU X0, 0*16(Dst)
    80  	MOVOU X3, 1*16(Dst)
    81  	RET
    82  
        // Release the register aliases that were defined ahead of hChaCha20SSE2 and shared by both hChaCha20 functions.
    83  #undef Dst
    84  #undef Nonce
    85  #undef Key
    86  #undef Rounds
    87  
    88  #define State AX // AX: state pointer; re-pointed at block in BUFFER_KEYSTREAM
    89  #define Dst DI // DI: current write position in dst
    90  #define Src SI // SI: current read position in src
    91  #define Len DX // DX: bytes of src still to process
    92  #define Tmp0 BX // BX: round counter, later a FINALIZE byte temporary
    93  #define Tmp1 BP // BP: FINALIZE byte temporary
    94  
    95  // func xorKeyStreamSSE2(dst, src []byte, block, state *[64]byte, rounds int) int
        //
        // XORs len(src) bytes of src with generated keystream and writes the
        // result to dst, 64 bytes per generated block. For every block the
        // 64-byte state is copied, run through the round loop, added back to the
        // original state, and the last state row (X3) is advanced with PADDQ.
        //
        // If fewer than 64 bytes remain, the whole 64-byte keystream block is
        // stored to *block, the remaining byte count is returned, and only that
        // many bytes are XORed into dst. Otherwise the return value stays 0.
        //
        // On exit only state[48:64] (X3) is written back to *state.
        // Only len(src) is read; dst's length is never checked here, so the
        // caller must guarantee len(dst) >= len(src).
        //
        // Flags value 4 is NOSPLIT in Go's textflag.h; $0-40 declares no local
        // frame and 40 bytes of arguments plus result.
        // Clobbers: AX, BX, BP, DX, SI, DI, X0-X7, flags.
    96  TEXT ·xorKeyStreamSSE2(SB), 4, $0-40
    97  	MOVL dst_base+0(FP), Dst
    98  	MOVL src_base+12(FP), Src
    99  	MOVL state+28(FP), State
   100  	MOVL src_len+16(FP), Len
   101  	MOVL $0, ret+36(FP)       // Number of bytes written to the keystream buffer - 0 iff len mod 64 == 0
   102  
        	// X0..X3 hold the four 16-byte rows of *state for the whole call;
        	// X0 is temporarily overwritten below and reloaded when needed.
   103  	MOVOU 0*16(State), X0
   104  	MOVOU 1*16(State), X1
   105  	MOVOU 2*16(State), X2
   106  	MOVOU 3*16(State), X3
        	// Empty input: skip straight to the state write-back.
   107  	TESTL Len, Len
   108  	JZ    DONE
   109  
        	// Start of one 64-byte block: X4..X7 become the working copy of the
        	// state, and Tmp0 is reloaded with the requested round count.
   110  GENERATE_KEYSTREAM:
   111  	MOVO X0, X4
   112  	MOVO X1, X5
   113  	MOVO X2, X6
   114  	MOVO X3, X7
   115  	MOVL rounds+32(FP), Tmp0
   116  
        	// Two quarter-round macro invocations per iteration. X0 is passed as
        	// the fifth macro argument, which is why it is reloaded from state
        	// after the loop. JA keeps looping only while the subtraction neither
        	// borrows nor reaches zero, so an odd or zero round count still
        	// terminates after the current iteration.
   117  CHACHA_LOOP:
   118  	CHACHA_QROUND_SSE2(X4, X5, X6, X7, X0)
   119  	CHACHA_SHUFFLE_SSE(X5, X6, X7)
   120  	CHACHA_QROUND_SSE2(X4, X5, X6, X7, X0)
   121  	CHACHA_SHUFFLE_SSE(X7, X6, X5)
   122  	SUBL $2, Tmp0
   123  	JA   CHACHA_LOOP
   124  
        	// Add the original state to the working copy (32-bit lanes) to form
        	// the keystream block in X4..X7, then advance X3 for the next block by
        	// adding the `one` constant with 64-bit lanes (presumably 1 in the low
        	// quadword, i.e. a 64-bit block counter - confirm in const.s).
   125  	MOVOU 0*16(State), X0 // Restore X0 from state
   126  	PADDL X0, X4
   127  	PADDL X1, X5
   128  	PADDL X2, X6
   129  	PADDL X3, X7
   130  	MOVOU ·one<>(SB), X0
   131  	PADDQ X0, X3
   132  
        	// Fewer than 64 bytes left: take the partial-block path. The compare
        	// is signed; Len is non-zero here and is assumed to be a non-negative
        	// Go slice length.
   133  	CMPL Len, $64
   134  	JL   BUFFER_KEYSTREAM
   135  
        	// Full block. XOR_SSE is defined outside this file; presumably it XORs
        	// 64 bytes at offset 0 of Src with X4..X7 into Dst using X0 as scratch
        	// - confirm in macro.s.
   136  	XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X0)
   137  	MOVOU 0*16(State), X0    // Restore X0 from state
   138  	ADDL  $64, Src
   139  	ADDL  $64, Dst
   140  	SUBL  $64, Len
   141  	JZ    DONE
   142  	JMP   GENERATE_KEYSTREAM // There is at least one more plaintext byte
   143  
        	// Partial block: State is re-pointed at *block, the full 64-byte
        	// keystream block is saved there, and the remaining byte count becomes
        	// the return value. FINALIZE then XORs those Len bytes from Src with the
        	// saved block into Dst; it clobbers Tmp0/Tmp1 and advances Dst, Src,
        	// State and Len.
   144  BUFFER_KEYSTREAM:
   145  	MOVL  block+24(FP), State
   146  	MOVOU X4, 0(State)
   147  	MOVOU X5, 16(State)
   148  	MOVOU X6, 32(State)
   149  	MOVOU X7, 48(State)
   150  	MOVL  Len, ret+36(FP)     // Number of bytes written to the keystream buffer - 0 < Len < 64
   151  	FINALIZE(Dst, Src, State, Len, Tmp0, Tmp1)
   152  
        	// Reload the state pointer (State may have been re-pointed at block)
        	// and persist only the advanced last row.
   153  DONE:
   154  	MOVL  state+28(FP), State
   155  	MOVOU X3, 3*16(State)
   156  	RET
   157  
        // Release the register aliases used by xorKeyStreamSSE2.
   158  #undef State
   159  #undef Dst
   160  #undef Src
   161  #undef Len
   162  #undef Tmp0
   163  #undef Tmp1