github.com/mad-day/Yawning-crypto@v0.0.0-20190711051033-5a5f8cca32ec/morus/hwaccel_amd64.s (about)

     1  // +build !noasm,go1.10
     2  // hwaccel_amd64.s - AMD64 optimized routines
     3  //
     4  // To the extent possible under law, Yawning Angel has waived all copyright
     5  // and related or neighboring rights to the software, using the Creative
     6  // Commons "CC0" public domain dedication. See LICENSE or
     7  // <http://creativecommons.org/publicdomain/zero/1.0/> for full details.
     8  
     9  #include "textflag.h"
    10  
    11  // func cpuidAmd64(cpuidParams *uint32)
        //
        // cpuidParams points at four consecutive uint32 words. Word 0 is loaded
        // into EAX and word 2 into ECX before CPUID is executed; afterwards the
        // resulting EAX, EBX, ECX and EDX are stored to words 0, 1, 2 and 3.
    12  TEXT ·cpuidAmd64(SB), NOSPLIT, $0-8
    13  	MOVQ cpuidParams+0(FP), SI // SI = base of the 4-word parameter block
    14  	MOVL (SI), AX              // word 0 -> EAX
    15  	MOVL 8(SI), CX             // word 2 -> ECX
    16  	CPUID
    17  	MOVL DX, 12(SI)            // EDX -> word 3
    18  	MOVL CX, 8(SI)             // ECX -> word 2
    19  	MOVL BX, 4(SI)             // EBX -> word 1
    20  	MOVL AX, (SI)              // EAX -> word 0
    21  	RET
    22  
    23  // func xgetbv0Amd64(xcrVec *uint32)
        //
        // Executes XGETBV with ECX = 0 and stores the result to the two uint32
        // words at xcrVec: EAX to word 0, EDX to word 1.
    24  TEXT ·xgetbv0Amd64(SB), NOSPLIT, $0-8
    25  	XORL CX, CX           // ECX = 0 selects the register read by XGETBV
    26  	MOVQ xcrVec+0(FP), DI // DI = destination for the two result words
    27  	XGETBV
    28  	MOVL DX, 4(DI)        // EDX -> word 1
    29  	MOVL AX, (DI)         // EAX -> word 0
    30  	RET
    31  
    32  // Some useful macros for loading/storing the state, and the state update
    33  // function, along with aliases for the registers used for readability.
    34  
    35  // YMM registers: Sx -> State, M0 -> Message block, Tx -> Temporary.
        // The code below additionally uses Y6 and Y7 as unnamed scratch registers
        // and keeps Y13 all-zero for wiping the stack scratch block.
    36  // GP registers: AX is the loop counter in the macros; SI, DI and CX are
        // clobbered by COPY. Pointers and lengths live in R8-R10, R14 and R15.
    37  #define S0 Y0
    38  #define S1 Y1
    39  #define S2 Y2
    40  #define S3 Y3
    41  #define S4 Y4
    42  #define M0 Y5
    43  #define T0 Y14
    44  #define T1 Y15
    45  
    46  // This is essentially a naive translation of the intrinsics, but neither GCC
    47  // nor clang's idea of what this should be appears to be better on Broadwell,
    48  // and there is a benefit to being easy to cross reference with the upstream
    49  // implementation.
        //
        // STATE_UPDATE() performs one update of the state S0..S4 using the message
        // block in M0. It consists of five steps, one per state register, in the
        // order S0, S1, S2, S3, S4. Each step:
        //   1. XORs the target with one other state register and with the AND of
        //      two further state registers (the S1..S4 steps also XOR in M0),
        //   2. rotates every 64-bit lane of the target left by 13, 46, 38, 7 and 4
        //      bits respectively (VPSLLQ k / VPSRLQ 64-k / VPOR),
        //   3. rotates the four 64-bit lanes of a different state register with
        //      VPERMQ. The immediates $-109 (0x93 as an imm8), $78 (0x4E) and $57
        //      (0x39) move the lanes up by one, two and three positions.
        //
        // Reads M0 without modifying it; clobbers T0 and T1.
    50  #define STATE_UPDATE() \
    51  	VPXOR  S0, S3, S0    \
    52  	VPAND  S1, S2, T0    \
    53  	VPXOR  S0, T0, S0    \
    54  	VPSLLQ $13, S0, T0   \
    55  	VPSRLQ $51, S0, T1   \
    56  	VPOR   T0, T1, S0    \
    57  	VPERMQ $-109, S3, S3 \
    58  	                     \
    59  	VPXOR  S1, M0, S1    \
    60  	VPXOR  S1, S4, S1    \
    61  	VPAND  S2, S3, T0    \
    62  	VPXOR  S1, T0, S1    \
    63  	VPSLLQ $46, S1, T0   \
    64  	VPSRLQ $18, S1, T1   \
    65  	VPOR   T0, T1, S1    \
    66  	VPERMQ $78, S4, S4   \
    67  	                     \
    68  	VPXOR  S2, M0, S2    \
    69  	VPXOR  S2, S0, S2    \
    70  	VPAND  S3, S4, T0    \
    71  	VPXOR  S2, T0, S2    \
    72  	VPSLLQ $38, S2, T0   \
    73  	VPSRLQ $26, S2, T1   \
    74  	VPOR   T0, T1, S2    \
    75  	VPERMQ $57, S0, S0   \
    76  	                     \
    77  	VPXOR  S3, M0, S3    \
    78  	VPXOR  S3, S1, S3    \
    79  	VPAND  S4, S0, T0    \
    80  	VPXOR  S3, T0, S3    \
    81  	VPSLLQ $7, S3, T0    \
    82  	VPSRLQ $57, S3, T1   \
    83  	VPOR   T0, T1, S3    \
    84  	VPERMQ $78, S1, S1   \
    85  	                     \
    86  	VPXOR  S4, M0, S4    \
    87  	VPXOR  S4, S2, S4    \
    88  	VPAND  S0, S1, T0    \
    89  	VPXOR  S4, T0, S4    \
    90  	VPSLLQ $4, S4, T0    \
    91  	VPSRLQ $60, S4, T1   \
    92  	VPOR   T0, T1, S4    \
    93  	VPERMQ $-109, S2, S2
    94  
        // COPY(DST, SRC, LEN) copies LEN bytes from the address in SRC to the
        // address in DST with REP MOVSB.
        //
        // The arguments are substituted textually and loaded in the order
        // SRC -> SI, DST -> DI, LEN -> CX, so an argument must not be a register
        // that an earlier load in that sequence has already overwritten.
        // Clobbers SI, DI and CX. The copy direction depends on the direction
        // flag; both callers in this file execute CLD on entry.
    95  #define COPY(DST, SRC, LEN) \
    96  	MOVQ SRC, SI \
    97  	MOVQ DST, DI \
    98  	MOVQ LEN, CX \
    99  	REP          \
   100  	MOVSB
   101  
        // INIT_STATE(IV, KEY) builds the initial state and runs 16 state updates.
        //
        //   S0 = the 16 bytes at (IV) in the low half, zero in the high half
        //   S1 = the 32 bytes at (KEY)
        //   S2 = all ones (VPCMPEQD of a register with itself)
        //   S3 = zero
        //   S4 = the 32 bytes at ·initializationConstants
        //
        // M0 is zeroed for the 16 updates. The key is saved in Y6 beforehand and
        // XORed into S1 once the loop has finished.
        //
        // The IV is loaded with the VEX-encoded 128-bit VMOVDQU rather than the
        // legacy-SSE MOVOU: the VEX form zeroes the upper half of Y0 by itself and
        // avoids an SSE/AVX transition in the middle of 256-bit VEX code. The
        // preceding VPXOR of S0 is therefore no longer required for correctness
        // and is kept only so the zeroing stays explicit.
        //
        // Clobbers AX, Y6, M0, T0 and T1.
   102  #define INIT_STATE(IV, KEY) \
   103  	VPXOR     S0, S0, S0                       \
   104  	VMOVDQU   (IV), X0                         \
   105  	VMOVDQU   (KEY), S1                        \
   106  	VPCMPEQD  S2, S2, S2                       \
   107  	VPXOR     S3, S3, S3                       \
   108  	VMOVDQU   ·initializationConstants(SB), S4 \
   109  	VPXOR     M0, M0, M0                       \
   110  	VMOVDQA   S1, Y6                           \
   111  	MOVQ      $16, AX                          \
   112  	                                           \
   113  initLoop:                                    \
   114  	STATE_UPDATE()                             \
   115  	SUBQ      $1, AX                           \
   116  	JNZ       initLoop                         \
   117  	                                           \
   118  	VPXOR     Y6, S1, S1
   119  
        // ABSORB_BLOCKS(A, ALEN, SCRATCH) feeds ALEN bytes starting at the address
        // in A into the state.
        //
        // Every full 32-byte block is loaded into M0 and followed by one
        // STATE_UPDATE(). A trailing partial block (ALEN & 31 bytes) is copied
        // into SCRATCH, the whole 32-byte SCRATCH block is loaded into M0, and one
        // more STATE_UPDATE() is run. The macro does not clear SCRATCH itself, so
        // the padding bytes are whatever SCRATCH already holds; both callers in
        // this file zero it on function entry.
        //
        // Side effects: A is advanced by 32 for every full block, ALEN is reduced
        // to ALEN & 31, and AX, SI, DI, CX, M0, T0 and T1 are clobbered.
   120  #define ABSORB_BLOCKS(A, ALEN, SCRATCH) \
   121  	MOVQ            ALEN, AX       \
   122  	SHRQ            $5, AX         \
   123  	JZ              absorbPartial  \
   124  loopAbsorbFull:                  \
   125  	VMOVDQU         (A), M0        \
   126  	STATE_UPDATE()                 \
   127  	ADDQ            $32, A         \
   128  	SUBQ            $1, AX         \
   129  	JNZ             loopAbsorbFull \
   130  absorbPartial:                   \
   131  	ANDQ            $31, ALEN      \
   132  	JZ              absorbDone     \
   133  	COPY(SCRATCH, A, ALEN)         \
   134  	VMOVDQU         (SCRATCH), M0  \
   135  	STATE_UPDATE()                 \
   136  absorbDone:
   137  
        // FINALIZE(TAG, ALEN, MLEN, SCRATCH) runs the final rounds and stores the
        // 16-byte tag at the address in TAG.
        //
        // ALEN and MLEN are multiplied by 8 (byte counts -> bit counts) and
        // written to the first two 64-bit words of SCRATCH, which then becomes the
        // message block M0. The macro writes only those 16 bytes; the callers in
        // this file zero SCRATCH immediately before invoking it. S4 is XORed with
        // S0, ten state updates are run, and the tag is the low 128 bits of
        //   S0 ^ VPERMQ($57, S1) ^ (S2 & S3).
        //
        // The tag is stored with the VEX-encoded VMOVDQU instead of the legacy-SSE
        // MOVOU, so no SSE/AVX transition is taken while the upper YMM halves are
        // still in use. The bytes written are identical.
        //
        // Clobbers ALEN, MLEN, AX, M0, Y6, Y7, T0, T1 and the state registers.
   138  #define FINALIZE(TAG, ALEN, MLEN, SCRATCH) \
   139  	SHLQ       $3, ALEN         \
   140  	MOVQ       ALEN, (SCRATCH)  \
   141  	SHLQ       $3, MLEN         \
   142  	MOVQ       MLEN, 8(SCRATCH) \
   143  	                            \
   144  	VPXOR      S4, S0, S4       \
   145  	VMOVDQU    (SCRATCH), M0    \
   146  	                            \
   147  	MOVQ       $10, AX          \
   148  loopFinal:                    \
   149  	STATE_UPDATE()              \
   150  	SUBQ       $1, AX           \
   151  	JNZ        loopFinal        \
   152  	                            \
   153  	VPERMQ     $57, S1, Y6      \
   154  	VPXOR      S0, Y6, Y6       \
   155  	VPAND      S2, S3, Y7       \
   156  	VPXOR      Y6, Y7, Y7       \
   157  	VMOVDQU    X7, (TAG)
   158  
   159  // func aeadEncryptAVX2(c, m, a []byte, nonce, key *byte)
        //
        // Encrypts len(m) bytes from m into c and stores the 16-byte tag directly
        // after the ciphertext, at c[len(m):]. len(c) is never read, so the caller
        // must provide that space. a is the associated data; nonce and key are
        // read as 16 and 32 bytes respectively (see INIT_STATE).
        //
        // Register roles: R15 -> 32-byte scratch block in the local frame,
        // Y13 = zero (used to wipe the scratch block), R8 = source pointer,
        // R9 = length, R10 = destination pointer, AX = block counter.
        //
        // Argument references use the name_base / name_len spelling so that every
        // symbolic name agrees with its frame offset, as `go vet` expects.
   160  TEXT ·aeadEncryptAVX2(SB), NOSPLIT, $32-88
   161  	MOVQ    SP, R15
   162  	VPXOR   Y13, Y13, Y13
   163  	VMOVDQU Y13, (R15)
   164  	CLD // COPY relies on REP MOVSB copying forwards
   165  
   166  	// Initialize the state.
   167  	MOVQ nonce+72(FP), R8
   168  	MOVQ key+80(FP), R9
   169  	INIT_STATE(R8, R9)
   170  
   171  	// Absorb the AD.
   172  	MOVQ a_base+48(FP), R8 // &a[0] -> R8
   173  	MOVQ a_len+56(FP), R9  // len(a) -> R9
   174  	ABSORB_BLOCKS(R8, R9, R15)
   175  
   176  	// Encrypt the data.
   177  	MOVQ m_base+24(FP), R8 // &m[0] -> R8
   178  	MOVQ m_len+32(FP), R9  // len(m) -> R9
   179  	MOVQ c_base+0(FP), R10 // &c[0] -> R10
   180  
   181  	MOVQ R9, AX
   182  	SHRQ $5, AX // AX = number of full 32-byte blocks
   183  	JZ   encryptPartial
   184  
   185  loopEncryptFull:
        	// Y6 = S0 ^ VPERMQ($57, S1) ^ (S2 & S3) is XORed with the plaintext
        	// block; the state is then updated with the plaintext in M0.
   186  	VMOVDQU (R8), M0
   187  	VPERMQ  $57, S1, Y6
   188  	VPXOR   S0, Y6, Y6
   189  	VPAND   S2, S3, Y7
   190  	VPXOR   Y6, Y7, Y6
   191  	VPXOR   M0, Y6, Y6
   192  	VMOVDQU Y6, (R10)
   193  	STATE_UPDATE()
   194  	ADDQ    $32, R8
   195  	ADDQ    $32, R10
   196  	SUBQ    $1, AX
   197  	JNZ     loopEncryptFull
   198  
   199  encryptPartial:
        	// Trailing len(m) & 31 bytes: stage them in the zeroed scratch block so
        	// that M0 holds the zero-padded plaintext, and copy only R9 bytes of
        	// the result out to c.
   200  	ANDQ    $31, R9
   201  	JZ      encryptDone
   202  	VMOVDQU Y13, (R15)
   203  	COPY(R15, R8, R9)
   204  	VMOVDQU (R15), M0
   205  	VPERMQ  $57, S1, Y6
   206  	VPXOR   S0, Y6, Y6
   207  	VPAND   S2, S3, Y7
   208  	VPXOR   Y6, Y7, Y6
   209  	VPXOR   M0, Y6, Y6
   210  	VMOVDQU Y6, (R15)
   211  	STATE_UPDATE()
   212  	COPY(R10, R15, R9)
   213  	ADDQ    R9, R10 // R10 now points just past the ciphertext
   214  
   215  encryptDone:
   216  
   217  	// Finalize and write the tag.
   218  	MOVQ    a_len+56(FP), R8 // len(a) -> R8
   219  	MOVQ    m_len+32(FP), R9 // len(m) -> R9
   220  	VMOVDQU Y13, (R15)
   221  	FINALIZE(R10, R8, R9, R15)
   222  
   223  	VMOVDQU Y13, (R15) // wipe the scratch block before returning
   224  	VZEROUPPER
   225  	RET
   226  
   227  // func aeadDecryptAVX2(m, c, a []byte, nonce, key, tag *byte)
        //
        // Decrypts len(c) bytes from c into m and stores the 16-byte tag computed
        // over the data at the address in tag. No tag comparison happens here.
        // len(m) is never read: every length used for the message is len(c). a is
        // the associated data; nonce and key are read as 16 and 32 bytes
        // respectively (see INIT_STATE).
        //
        // Register roles: R15 -> 32-byte scratch block in the local frame,
        // Y13 = zero (used to wipe the scratch block), R8 = source pointer,
        // R9 = length, R10 = destination pointer, R14 = tag pointer,
        // AX = block counter.
        //
        // Argument references use the name_base / name_len spelling so that every
        // symbolic name agrees with its frame offset, as `go vet` expects. The
        // length passed to FINALIZE was previously written as m+32(FP), but
        // offset 32 is the length word of c, so it is now named c_len+32(FP).
   228  TEXT ·aeadDecryptAVX2(SB), NOSPLIT, $32-96
   229  	MOVQ    SP, R15
   230  	VPXOR   Y13, Y13, Y13
   231  	VMOVDQU Y13, (R15)
   232  	CLD // COPY and REP STOSB below must run forwards
   233  
   234  	// Initialize the state.
   235  	MOVQ nonce+72(FP), R8
   236  	MOVQ key+80(FP), R9
   237  	INIT_STATE(R8, R9)
   238  
   239  	// Absorb the AD.
   240  	MOVQ a_base+48(FP), R8 // &a[0] -> R8
   241  	MOVQ a_len+56(FP), R9  // len(a) -> R9
   242  	ABSORB_BLOCKS(R8, R9, R15)
   243  
   244  	// Decrypt the data.
   245  	MOVQ c_base+24(FP), R8 // &c[0] -> R8
   246  	MOVQ c_len+32(FP), R9  // len(c) -> R9
   247  	MOVQ m_base+0(FP), R10 // &m[0] -> R10
   248  
   249  	MOVQ R9, AX
   250  	SHRQ $5, AX // AX = number of full 32-byte blocks
   251  	JZ   decryptPartial
   252  
   253  loopDecryptFull:
        	// M0 = ciphertext ^ S0 ^ VPERMQ($57, S1) ^ (S2 & S3), i.e. the
        	// plaintext block, which is stored and then fed to the state update.
   254  	VMOVDQU (R8), M0
   255  	VPERMQ  $57, S1, Y6
   256  	VPXOR   S0, Y6, Y6
   257  	VPAND   S2, S3, Y7
   258  	VPXOR   Y6, Y7, Y6
   259  	VPXOR   M0, Y6, M0
   260  	VMOVDQU M0, (R10)
   261  	STATE_UPDATE()
   262  	ADDQ    $32, R8
   263  	ADDQ    $32, R10
   264  	SUBQ    $1, AX
   265  	JNZ     loopDecryptFull
   266  
   267  decryptPartial:
        	// Trailing len(c) & 31 bytes: decrypt in the scratch block, copy R9
        	// bytes out to m, then zero scratch[R9:32] so that the state update
        	// sees the plaintext padded with zeros.
   268  	ANDQ    $31, R9
   269  	JZ      decryptDone
   270  	VMOVDQU Y13, (R15)
   271  	COPY(R15, R8, R9)
   272  	VMOVDQU (R15), M0
   273  	VPERMQ  $57, S1, Y6
   274  	VPXOR   S0, Y6, Y6
   275  	VPAND   S2, S3, Y7
   276  	VPXOR   Y6, Y7, Y6
   277  	VPXOR   M0, Y6, M0
   278  	VMOVDQU M0, (R15)
   279  	COPY(R10, R15, R9)
   280  	MOVQ    $0, AX  // AL = fill byte for REP STOSB
   281  	MOVQ    R15, DI
   282  	MOVQ    $32, CX
   283  	SUBQ    R9, CX  // CX = 32 - R9 bytes to clear
   284  	ADDQ    R9, DI  // DI = &scratch[R9]
   285  	REP
   286  	STOSB
   287  	VMOVDQU (R15), M0
   288  	STATE_UPDATE()
   289  
   290  decryptDone:
   291  
   292  	// Finalize and write the tag.
   293  	MOVQ    a_len+56(FP), R8 // len(a) -> R8
   294  	MOVQ    c_len+32(FP), R9 // len(c) -> R9, used as the message length
   295  	MOVQ    tag+88(FP), R14  // tag -> R14
   296  	VMOVDQU Y13, (R15)
   297  	FINALIZE(R14, R8, R9, R15)
   298  
   299  	VMOVDQU Y13, (R15) // wipe the scratch block before returning
   300  	VZEROUPPER
   301  	RET