github.com/mattn/go@v0.0.0-20171011075504-07f7db3ea99f/src/crypto/aes/gcm_amd64.s (about)

     1  // Copyright 2015 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // This is an optimized implementation of AES-GCM using AES-NI and CLMUL-NI.
     6  // The implementation uses some optimizations as described in:
     7  // [1] Gueron, S., Kounavis, M.E.: Intel® Carry-Less Multiplication
     8  //     Instruction and its Usage for Computing the GCM Mode rev. 2.02
     9  // [2] Gueron, S., Krasnov, V.: Speeding up Counter Mode in Software and
    10  //     Hardware
    11  
    12  #include "textflag.h"
    13  
        // Register aliases shared by every routine in this file.
        //
        // B0-B7: eight working XMM registers (AES state / data blocks). The
        // single-block loops also reuse B1-B7 to cache round keys 1-7.
    14  #define B0 X0
    15  #define B1 X1
    16  #define B2 X2
    17  #define B3 X3
    18  #define B4 X4
    19  #define B5 X5
    20  #define B6 X6
    21  #define B7 X7
    22  
        // Accumulators for the three-product carry-less multiply: ACC0 collects
        // the low*low products (PCLMULQDQ $0x00), ACC1 the high*high products
        // (PCLMULQDQ $0x11) and ACCM the products of the folded (hi XOR lo)
        // halves.
    23  #define ACC0 X8
    24  #define ACC1 X9
    25  #define ACCM X10
    26  
        // T0-T2: scratch registers. POLY and BSWAP hold the gcmPoly and
        // bswapMask constants once a routine has loaded them.
    27  #define T0 X11
    28  #define T1 X12
    29  #define T2 X13
    30  #define POLY X14
    31  #define BSWAP X15
    32  
        // bswapMask: PSHUFB control whose byte i is 15-i, i.e. it reverses the
        // byte order of a 16-byte vector.
    33  DATA bswapMask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f
    34  DATA bswapMask<>+0x08(SB)/8, $0x0001020304050607
    35  
        // gcmPoly: constant used by the reduction steps (reduceRound and the
        // equivalent open-coded sequences) and, masked by the sign of H, by the
        // "H * 2" step in gcmAesInit. See reference [1] in the file header.
    36  DATA gcmPoly<>+0x00(SB)/8, $0x0000000000000001
    37  DATA gcmPoly<>+0x08(SB)/8, $0xc200000000000000
    38  
        // andMask: 15 entries of 16 bytes. Entry k (k = 1..15, at byte offset
        // 16*(k-1)) has its k lowest bytes set to 0xff and the rest zero. The
        // tail code indexes it as -16(base)(len*16) to keep only the first len
        // bytes of a partial block.
    39  DATA andMask<>+0x00(SB)/8, $0x00000000000000ff
    40  DATA andMask<>+0x08(SB)/8, $0x0000000000000000
    41  DATA andMask<>+0x10(SB)/8, $0x000000000000ffff
    42  DATA andMask<>+0x18(SB)/8, $0x0000000000000000
    43  DATA andMask<>+0x20(SB)/8, $0x0000000000ffffff
    44  DATA andMask<>+0x28(SB)/8, $0x0000000000000000
    45  DATA andMask<>+0x30(SB)/8, $0x00000000ffffffff
    46  DATA andMask<>+0x38(SB)/8, $0x0000000000000000
    47  DATA andMask<>+0x40(SB)/8, $0x000000ffffffffff
    48  DATA andMask<>+0x48(SB)/8, $0x0000000000000000
    49  DATA andMask<>+0x50(SB)/8, $0x0000ffffffffffff
    50  DATA andMask<>+0x58(SB)/8, $0x0000000000000000
    51  DATA andMask<>+0x60(SB)/8, $0x00ffffffffffffff
    52  DATA andMask<>+0x68(SB)/8, $0x0000000000000000
    53  DATA andMask<>+0x70(SB)/8, $0xffffffffffffffff
    54  DATA andMask<>+0x78(SB)/8, $0x0000000000000000
    55  DATA andMask<>+0x80(SB)/8, $0xffffffffffffffff
    56  DATA andMask<>+0x88(SB)/8, $0x00000000000000ff
    57  DATA andMask<>+0x90(SB)/8, $0xffffffffffffffff
    58  DATA andMask<>+0x98(SB)/8, $0x000000000000ffff
    59  DATA andMask<>+0xa0(SB)/8, $0xffffffffffffffff
    60  DATA andMask<>+0xa8(SB)/8, $0x0000000000ffffff
    61  DATA andMask<>+0xb0(SB)/8, $0xffffffffffffffff
    62  DATA andMask<>+0xb8(SB)/8, $0x00000000ffffffff
    63  DATA andMask<>+0xc0(SB)/8, $0xffffffffffffffff
    64  DATA andMask<>+0xc8(SB)/8, $0x000000ffffffffff
    65  DATA andMask<>+0xd0(SB)/8, $0xffffffffffffffff
    66  DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
    67  DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff
    68  DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
    69  
        // All three tables are file-local (<>), read-only and pointer-free.
        // Sizes: 16, 16 and 15*16 = 240 bytes.
    70  GLOBL bswapMask<>(SB), (NOPTR+RODATA), $16
    71  GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16
    72  GLOBL andMask<>(SB), (NOPTR+RODATA), $240
    73  
    74  // func hasGCMAsm() bool
    75  // Reports whether CPUID leaf 1 has both ECX bit 25 (AES-NI) and ECX bit 1
    76  // (CLMUL-NI) set; the returned byte is 1 only when both are present.
    77  TEXT ·hasGCMAsm(SB),NOSPLIT,$0
    78  	MOVL $1, AX
    79  	CPUID
    80  	MOVL CX, AX
    81  	SHRL $1, AX
    82  	SHRL $25, CX
    83  	ANDL AX, CX
    84  	ANDL $1, CX
    85  	MOVB CX, ret+0(FP)
    86  	RET
    87  
    88  // func aesEncBlock(dst, src *[16]byte, ks []uint32)
    89  //
    90  // Encrypts the 16-byte block at src with the expanded key schedule ks and
    91  // stores the 16-byte result at dst. len(ks)/4 - 1 is treated as the round
    92  // count: below 12 takes the 10-round path, exactly 12 the 12-round path,
    93  // and anything above 12 the 14-round path.
    94  TEXT ·aesEncBlock(SB),NOSPLIT,$0
    95  // blockRound(i): unaligned load of round key i into X1, then one AESENC.
    96  #define blockRound(i) MOVOU (16*i)(DX), X1; AESENC X1, X0
    97  
    98  	MOVQ ks_len+24(FP), CX
    99  	MOVQ ks_base+16(FP), DX
   100  	MOVQ src+8(FP), SI
   101  	MOVQ dst+0(FP), DI
   102  
   103  	// CX = len(ks)/4 - 1. The compare is done up front; none of the SSE/AES
   104  	// instructions before the JB/JE below modify EFLAGS.
   105  	SHRQ $2, CX
   106  	DECQ CX
   107  	CMPQ CX, $12
   108  
   109  	// Whitening: state = src XOR round key 0.
   110  	MOVOU (SI), X0
   111  	MOVOU (16*0)(DX), X1
   112  	PXOR X1, X0
   113  
   114  	blockRound(1)
   115  	blockRound(2)
   116  	blockRound(3)
   117  	blockRound(4)
   118  	blockRound(5)
   119  	blockRound(6)
   120  	blockRound(7)
   121  	blockRound(8)
   122  	blockRound(9)
   123  	MOVOU (16*10)(DX), X1
   124  	JB lastRound
   125  	AESENC X1, X0
   126  	blockRound(11)
   127  	MOVOU (16*12)(DX), X1
   128  	JE lastRound
   129  	AESENC X1, X0
   130  	blockRound(13)
   131  	MOVOU (16*14)(DX), X1
   132  lastRound:
   133  	AESENCLAST X1, X0
   134  	MOVOU X0, (DI)
   135  	RET
   136  #undef blockRound
   137  
   138  // func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
   139  //
   140  // Folds the length block (pLen*8 in the low quadword, dLen*8 in the high
   141  // quadword) into the running state at T, multiplies by the productTable
   142  // entry at offset 16*14, byte-reverses the result, XORs it with tagMask
   143  // and writes the 16 bytes back to T.
   144  TEXT ·gcmAesFinish(SB),NOSPLIT,$0
   145  #define hashTbl DI
   146  #define maskPtr SI
   147  #define tagPtr DX
   148  #define ptBits AX
   149  #define adBits CX
   150  // foldStep(a): one of the two identical steps that reduce a using POLY.
   151  #define foldStep(a) MOVOU POLY, T0; PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a
   152  
   153  	MOVQ productTable+0(FP), hashTbl
   154  	MOVQ tagMask+8(FP), maskPtr
   155  	MOVQ T+16(FP), tagPtr
   156  	MOVQ pLen+24(FP), ptBits
   157  	MOVQ dLen+32(FP), adBits
   158  
   159  	MOVOU (tagPtr), ACC0
   160  	MOVOU (maskPtr), T2
   161  	MOVOU bswapMask<>(SB), BSWAP
   162  	MOVOU gcmPoly<>(SB), POLY
   163  
   164  	// Byte counts -> bit counts, packed as low = ptBits, high = adBits,
   165  	// then XORed with the current state.
   166  	SHLQ $3, ptBits
   167  	SHLQ $3, adBits
   168  	MOVQ ptBits, B0
   169  	PINSRQ $1, adBits, B0
   170  	PXOR ACC0, B0
   171  
   172  	// Three carry-less products (low, high, middle) against the table entry.
   173  	MOVOU (16*14)(hashTbl), ACC0
   174  	MOVOU (16*15)(hashTbl), ACCM
   175  	MOVOU ACC0, ACC1
   176  	PSHUFD $78, B0, T0
   177  	PXOR B0, T0
   178  	PCLMULQDQ $0x00, B0, ACC0
   179  	PCLMULQDQ $0x11, B0, ACC1
   180  	PCLMULQDQ $0x00, T0, ACCM
   181  
   182  	// Recombine: the middle term is split across the low and high halves.
   183  	PXOR ACC0, ACCM
   184  	PXOR ACC1, ACCM
   185  	MOVOU ACCM, T0
   186  	PSLLDQ $8, T0
   187  	PSRLDQ $8, ACCM
   188  	PXOR T0, ACC0
   189  	PXOR ACCM, ACC1
   190  
   191  	foldStep(ACC0)
   192  	foldStep(ACC0)
   193  	PXOR ACC1, ACC0
   194  
   195  	// Back to byte order, apply the tag mask, store.
   196  	PSHUFB BSWAP, ACC0
   197  	PXOR T2, ACC0
   198  	MOVOU ACC0, (tagPtr)
   199  	RET
   200  #undef foldStep
   201  #undef adBits
   202  #undef ptBits
   203  #undef tagPtr
   204  #undef maskPtr
   205  #undef hashTbl
   206  
   207  // func gcmAesInit(productTable *[256]byte, ks []uint32)
        //
        // Fills the 256-byte productTable from the expanded AES key ks.
        // The table is written as 8 pairs of 16-byte entries. Each pair is a
        // value followed by its folded form (high quadword XOR low quadword):
        //   offset 16*14 / 16*15 : doubled hash key H
        //   offset 16*12 / 16*13 : H multiplied once more by H
        //   ...                  : one further multiplication per pair
        //   offset 16*0  / 16*1  : result of the 7th multiplication
        // len(ks)/4 - 1 selects the 10-, 12- or 14-round AES path.
   208  TEXT ·gcmAesInit(SB),NOSPLIT,$0
   209  #define dst DI
   210  #define KS SI
   211  #define NR DX
   212  
   213  	MOVQ productTable+0(FP), dst
   214  	MOVQ ks_base+8(FP), KS
   215  	MOVQ ks_len+16(FP), NR
   216  
        	// NR = len(ks)/4 - 1
   217  	SHRQ $2, NR
   218  	DECQ NR
   219  
   220  	MOVOU bswapMask<>(SB), BSWAP
   221  	MOVOU gcmPoly<>(SB), POLY
   222  
   223  	// Encrypt block 0, with the AES key to generate the hash key H
        	// (the state starts as round key 0 itself, i.e. zero XOR key 0)
   224  	MOVOU (16*0)(KS), B0
   225  	MOVOU (16*1)(KS), T0
   226  	AESENC T0, B0
   227  	MOVOU (16*2)(KS), T0
   228  	AESENC T0, B0
   229  	MOVOU (16*3)(KS), T0
   230  	AESENC T0, B0
   231  	MOVOU (16*4)(KS), T0
   232  	AESENC T0, B0
   233  	MOVOU (16*5)(KS), T0
   234  	AESENC T0, B0
   235  	MOVOU (16*6)(KS), T0
   236  	AESENC T0, B0
   237  	MOVOU (16*7)(KS), T0
   238  	AESENC T0, B0
   239  	MOVOU (16*8)(KS), T0
   240  	AESENC T0, B0
   241  	MOVOU (16*9)(KS), T0
   242  	AESENC T0, B0
   243  	MOVOU (16*10)(KS), T0
        	// The flags from this compare feed both the JB and the later JE;
        	// the AESENC/MOVOU instructions in between do not modify them.
   244  	CMPQ NR, $12
   245  	JB initEncLast
   246  	AESENC T0, B0
   247  	MOVOU (16*11)(KS), T0
   248  	AESENC T0, B0
   249  	MOVOU (16*12)(KS), T0
   250  	JE initEncLast
   251  	AESENC T0, B0
   252  	MOVOU (16*13)(KS), T0
   253  	AESENC T0, B0
   254  	MOVOU (16*14)(KS), T0
   255  initEncLast:
   256  	AESENCLAST T0, B0
   257  
        	// Reverse the byte order of H before doing arithmetic on it.
   258  	PSHUFB BSWAP, B0
   259  	// H * 2
        	// 128-bit shift left by one built from 32-bit lanes: T0 becomes POLY
        	// if the top bit of H was set (sign of the top dword broadcast and
        	// smeared by PSRAL), else zero; T1 carries each lane's top bit into
        	// the next lane up.
   260  	PSHUFD $0xff, B0, T0
   261  	MOVOU B0, T1
   262  	PSRAL $31, T0
   263  	PAND POLY, T0
   264  	PSRLL $31, T1
   265  	PSLLDQ $4, T1
   266  	PSLLL $1, B0
   267  	PXOR T0, B0
   268  	PXOR T1, B0
   269  	// Karatsuba pre-computations
        	// B1 = B0 with its quadwords swapped, XOR B0 (hi^lo in both halves)
   270  	MOVOU B0, (16*14)(dst)
   271  	PSHUFD $78, B0, B1
   272  	PXOR B0, B1
   273  	MOVOU B1, (16*15)(dst)
   274  
        	// B2/B3 track the most recent product and its folded form; B0/B1
        	// stay fixed as the multiplier.
   275  	MOVOU B0, B2
   276  	MOVOU B1, B3
   277  	// Now prepare powers of H and pre-computations for them
   278  	MOVQ $7, AX
   279  
   280  initLoop:
        		// T0 = lo*lo, T1 = hi*hi, T2 = folded*folded
   281  		MOVOU B2, T0
   282  		MOVOU B2, T1
   283  		MOVOU B3, T2
   284  		PCLMULQDQ $0x00, B0, T0
   285  		PCLMULQDQ $0x11, B0, T1
   286  		PCLMULQDQ $0x00, B1, T2
   287  
        		// Turn T2 into the true middle term and split it across T0/T1.
   288  		PXOR T0, T2
   289  		PXOR T1, T2
   290  		MOVOU T2, B4
   291  		PSLLDQ $8, B4
   292  		PSRLDQ $8, T2
   293  		PXOR B4, T0
   294  		PXOR T2, T1
   295  
        		// Two reduction steps with POLY; the reduced product ends in B2.
   296  		MOVOU POLY, B2
   297  		PCLMULQDQ $0x01, T0, B2
   298  		PSHUFD $78, T0, T0
   299  		PXOR B2, T0
   300  		MOVOU POLY, B2
   301  		PCLMULQDQ $0x01, T0, B2
   302  		PSHUFD $78, T0, T0
   303  		PXOR T0, B2
   304  		PXOR T1, B2
   305  
        		// Store the product and its folded form one pair below the
        		// previous one (dst moves down by 32 bytes per iteration).
   306  		MOVOU B2, (16*12)(dst)
   307  		PSHUFD $78, B2, B3
   308  		PXOR B2, B3
   309  		MOVOU B3, (16*13)(dst)
   310  
        		// JNE tests the flags set by DECQ; LEAQ leaves flags untouched.
   311  		DECQ AX
   312  		LEAQ (-16*2)(dst), dst
   313  	JNE initLoop
   314  
   315  	RET
   316  #undef NR
   317  #undef KS
   318  #undef dst
   319  
   320  // func gcmAesData(productTable *[256]byte, data []byte, T *[16]byte)
        //
        // Hashes data with the multipliers in productTable and stores the
        // 16-byte state to T. The accumulator starts at zero (T is only
        // written, never read) and the stored state is not byte-swapped back.
        // Paths, chosen by len(data):
        //   0       store the zero state and return
        //   13      dataTLS: load 8+4+1 bytes into a zeroed block, one multiply
        //   >= 128  dataOctaLoop: 8 blocks per iteration, table entries 0..7
        //   other   dataSinglesLoop: one block at a time with entry 16*14
        // A trailing partial block (< 16 bytes) is zero-padded and hashed.
   321  TEXT ·gcmAesData(SB),NOSPLIT,$0
   322  #define pTbl DI
   323  #define aut SI
   324  #define tPtr CX
   325  #define autLen DX
   326  
        // reduceRound(a): one reduction step of a with POLY (clobbers T0).
        // mulRoundAAD(X, i): multiply block X by table pair i and accumulate
        // the three partial products into ACC0/ACC1/ACCM (clobbers X, T1, T2).
        // Neither macro is #undef'd at the end of this function; reduceRound is
        // used again by gcmAesEnc and gcmAesDec further down the file.
   327  #define reduceRound(a) 	MOVOU POLY, T0;	PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a
   328  #define mulRoundAAD(X ,i) \
   329  	MOVOU (16*(i*2))(pTbl), T1;\
   330  	MOVOU T1, T2;\
   331  	PCLMULQDQ $0x00, X, T1;\
   332  	PXOR T1, ACC0;\
   333  	PCLMULQDQ $0x11, X, T2;\
   334  	PXOR T2, ACC1;\
   335  	PSHUFD $78, X, T1;\
   336  	PXOR T1, X;\
   337  	MOVOU (16*(i*2+1))(pTbl), T1;\
   338  	PCLMULQDQ $0x00, X, T1;\
   339  	PXOR T1, ACCM
   340  
   341  	MOVQ productTable+0(FP), pTbl
   342  	MOVQ data_base+8(FP), aut
   343  	MOVQ data_len+16(FP), autLen
   344  	MOVQ T+32(FP), tPtr
   345  
        	// Running state starts at zero.
   346  	PXOR ACC0, ACC0
   347  	MOVOU bswapMask<>(SB), BSWAP
   348  	MOVOU gcmPoly<>(SB), POLY
   349  
   350  	TESTQ autLen, autLen
   351  	JEQ dataBail
   352  
   353  	CMPQ autLen, $13	// optimize the TLS case
   354  	JE dataTLS
   355  	CMPQ autLen, $128
   356  	JB startSinglesLoop
   357  	JMP dataOctaLoop
   358  
   359  dataTLS:
        	// Exactly 13 bytes: build the zero-padded block directly, set the
        	// remaining length to 0 and enter the single-block multiply.
   360  	MOVOU (16*14)(pTbl), T1
   361  	MOVOU (16*15)(pTbl), T2
   362  	PXOR B0, B0
   363  	MOVQ (aut), B0
   364  	PINSRD $2, 8(aut), B0
   365  	PINSRB $12, 12(aut), B0
   366  	XORQ autLen, autLen
   367  	JMP dataMul
   368  
   369  dataOctaLoop:
   370  		CMPQ autLen, $128
   371  		JB startSinglesLoop
   372  		SUBQ $128, autLen
   373  
   374  		MOVOU (16*0)(aut), X0
   375  		MOVOU (16*1)(aut), X1
   376  		MOVOU (16*2)(aut), X2
   377  		MOVOU (16*3)(aut), X3
   378  		MOVOU (16*4)(aut), X4
   379  		MOVOU (16*5)(aut), X5
   380  		MOVOU (16*6)(aut), X6
   381  		MOVOU (16*7)(aut), X7
   382  		LEAQ (16*8)(aut), aut
   383  		PSHUFB BSWAP, X0
   384  		PSHUFB BSWAP, X1
   385  		PSHUFB BSWAP, X2
   386  		PSHUFB BSWAP, X3
   387  		PSHUFB BSWAP, X4
   388  		PSHUFB BSWAP, X5
   389  		PSHUFB BSWAP, X6
   390  		PSHUFB BSWAP, X7
        		// The running state is folded into the first block only.
   391  		PXOR ACC0, X0
   392  
        		// Block 0 uses table pair 0; its products initialise the
        		// accumulators, blocks 1..7 are added by mulRoundAAD.
   393  		MOVOU (16*0)(pTbl), ACC0
   394  		MOVOU (16*1)(pTbl), ACCM
   395  		MOVOU ACC0, ACC1
   396  		PSHUFD $78, X0, T1
   397  		PXOR X0, T1
   398  		PCLMULQDQ $0x00, X0, ACC0
   399  		PCLMULQDQ $0x11, X0, ACC1
   400  		PCLMULQDQ $0x00, T1, ACCM
   401  
   402  		mulRoundAAD(X1, 1)
   403  		mulRoundAAD(X2, 2)
   404  		mulRoundAAD(X3, 3)
   405  		mulRoundAAD(X4, 4)
   406  		mulRoundAAD(X5, 5)
   407  		mulRoundAAD(X6, 6)
   408  		mulRoundAAD(X7, 7)
   409  
        		// Combine the three accumulators, then reduce once for all 8 blocks.
   410  		PXOR ACC0, ACCM
   411  		PXOR ACC1, ACCM
   412  		MOVOU ACCM, T0
   413  		PSRLDQ $8, ACCM
   414  		PSLLDQ $8, T0
   415  		PXOR ACCM, ACC1
   416  		PXOR T0, ACC0
   417  		reduceRound(ACC0)
   418  		reduceRound(ACC0)
   419  		PXOR ACC1, ACC0
   420  	JMP dataOctaLoop
   421  
   422  startSinglesLoop:
        	// T1/T2 cache the table pair at 16*14 / 16*15 for the single-block loop.
   423  	MOVOU (16*14)(pTbl), T1
   424  	MOVOU (16*15)(pTbl), T2
   425  
   426  dataSinglesLoop:
   427  
   428  		CMPQ autLen, $16
   429  		JB dataEnd
   430  		SUBQ $16, autLen
   431  
   432  		MOVOU (aut), B0
        		// dataMul is also entered from dataTLS and dataLoadLoop with B0
        		// already holding a zero-padded block and autLen == 0.
   433  dataMul:
   434  		PSHUFB BSWAP, B0
   435  		PXOR ACC0, B0
   436  
   437  		MOVOU T1, ACC0
   438  		MOVOU T2, ACCM
   439  		MOVOU T1, ACC1
   440  
   441  		PSHUFD $78, B0, T0
   442  		PXOR B0, T0
   443  		PCLMULQDQ $0x00, B0, ACC0
   444  		PCLMULQDQ $0x11, B0, ACC1
   445  		PCLMULQDQ $0x00, T0, ACCM
   446  
   447  		PXOR ACC0, ACCM
   448  		PXOR ACC1, ACCM
   449  		MOVOU ACCM, T0
   450  		PSRLDQ $8, ACCM
   451  		PSLLDQ $8, T0
   452  		PXOR ACCM, ACC1
   453  		PXOR T0, ACC0
   454  
        		// Same two steps as reduceRound(ACC0), written out.
   455  		MOVOU POLY, T0
   456  		PCLMULQDQ $0x01, ACC0, T0
   457  		PSHUFD $78, ACC0, ACC0
   458  		PXOR T0, ACC0
   459  
   460  		MOVOU POLY, T0
   461  		PCLMULQDQ $0x01, ACC0, T0
   462  		PSHUFD $78, ACC0, ACC0
   463  		PXOR T0, ACC0
   464  		PXOR ACC1, ACC0
   465  
   466  		LEAQ 16(aut), aut
   467  
   468  	JMP dataSinglesLoop
   469  
   470  dataEnd:
   471  
   472  	TESTQ autLen, autLen
   473  	JEQ dataBail
   474  
        	// 1..15 bytes left: point at the last byte and shift them into B0
        	// from the end, so the first remaining byte lands in byte 0 and the
        	// unused upper bytes stay zero.
   475  	PXOR B0, B0
   476  	LEAQ -1(aut)(autLen*1), aut
   477  
   478  dataLoadLoop:
   479  
   480  		PSLLDQ $1, B0
   481  		PINSRB $0, (aut), B0
   482  
   483  		LEAQ -1(aut), aut
   484  		DECQ autLen
   485  		JNE dataLoadLoop
   486  
        	// autLen is now 0, so after this multiply the loop exits to dataBail.
   487  	JMP dataMul
   488  
   489  dataBail:
   490  	MOVOU ACC0, (tPtr)
   491  	RET
   492  #undef pTbl
   493  #undef aut
   494  #undef tPtr
   495  #undef autLen
   496  
   497  // func gcmAesEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
        //
        // Encrypts len(src) bytes from src into dst in counter mode and folds
        // the produced ciphertext into the hash state at T (read on entry,
        // written on exit). The first keystream block is generated from the
        // counter at ctr plus one; only the last 32-bit word of the counter is
        // incremented, as a big-endian value. The block at ctr is not written.
        //
        // Stack frame (256 bytes):
        //   (0*16 .. 7*16)(SP)        byte-reversed ciphertext of the previous
        //                             8 blocks, waiting to be hashed
        //   (8*16 + 0*16 .. 7*16)(SP) 8 counter blocks, already XORed with
        //                             round key 0
        //
        // The register aliases and the macros aesRnd, aesRound, aesRndLast,
        // combinedRound and mulRound defined here are not #undef'd; gcmAesDec
        // below relies on the aliases and on the aesR* macros. Only increment
        // is #undef'd (and redefined there with different stack offsets).
   498  TEXT ·gcmAesEnc(SB),0,$256-96
   499  #define pTbl DI
   500  #define ctx DX
   501  #define ctrPtr CX
   502  #define ptx SI
   503  #define ks AX
   504  #define tPtr R8
   505  #define ptxLen R9
   506  #define aluCTR R10
   507  #define aluTMP R11
   508  #define aluK R12
   509  #define NR R13
   510  
        // increment(i): bump the 32-bit counter in aluCTR and store
        // bswap32(aluCTR XOR aluK) into the last dword of counter slot i. aluK is
        // the byte-swapped last dword of round key 0, so the stored word is the
        // big-endian counter already XORed with the key. Clobbers aluTMP.
   511  #define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + 8*16 + i*16)(SP)
        // aesRnd(k): one AES round with key register k on all eight blocks.
   512  #define aesRnd(k) AESENC k, B0; AESENC k, B1; AESENC k, B2; AESENC k, B3; AESENC k, B4; AESENC k, B5; AESENC k, B6; AESENC k, B7
        // aesRound(i): load round key i into T0, then one round on all eight blocks.
   513  #define aesRound(i) MOVOU (16*i)(ks), T0;AESENC T0, B0; AESENC T0, B1; AESENC T0, B2; AESENC T0, B3; AESENC T0, B4; AESENC T0, B5; AESENC T0, B6; AESENC T0, B7
        // aesRndLast(k): final AES round with key register k on all eight blocks.
   514  #define aesRndLast(k) AESENCLAST k, B0; AESENCLAST k, B1; AESENCLAST k, B2; AESENCLAST k, B3; AESENCLAST k, B4; AESENCLAST k, B5; AESENCLAST k, B6; AESENCLAST k, B7
        // combinedRound(i): AES round i on all eight blocks, interleaved with
        // the multiply-accumulate of saved ciphertext block i ((16*i)(SP)) by
        // table pair i. Clobbers T0, T1, T2.
   515  #define combinedRound(i) \
   516  	MOVOU (16*i)(ks), T0;\
   517  	AESENC T0, B0;\
   518  	AESENC T0, B1;\
   519  	AESENC T0, B2;\
   520  	AESENC T0, B3;\
   521  	 MOVOU (16*(i*2))(pTbl), T1;\
   522  	 MOVOU T1, T2;\
   523  	AESENC T0, B4;\
   524  	AESENC T0, B5;\
   525  	AESENC T0, B6;\
   526  	AESENC T0, B7;\
   527  	 MOVOU (16*i)(SP), T0;\
   528  	 PCLMULQDQ $0x00, T0, T1;\
   529  	 PXOR T1, ACC0;\
   530  	 PSHUFD $78, T0, T1;\
   531  	 PCLMULQDQ $0x11, T0, T2;\
   532  	 PXOR T1, T0;\
   533  	 PXOR T2, ACC1;\
   534  	 MOVOU (16*(i*2+1))(pTbl), T2;\
   535  	 PCLMULQDQ $0x00, T2, T0;\
   536  	 PXOR T0, ACCM
        // mulRound(i): the multiply-accumulate half of combinedRound on its
        // own, for saved ciphertext block i. Clobbers T0, T1, T2.
   537  #define mulRound(i) \
   538  	MOVOU (16*i)(SP), T0;\
   539  	MOVOU (16*(i*2))(pTbl), T1;\
   540  	MOVOU T1, T2;\
   541  	PCLMULQDQ $0x00, T0, T1;\
   542  	PXOR T1, ACC0;\
   543  	PCLMULQDQ $0x11, T0, T2;\
   544  	PXOR T2, ACC1;\
   545  	PSHUFD $78, T0, T1;\
   546  	PXOR T1, T0;\
   547  	MOVOU (16*(i*2+1))(pTbl), T1;\
   548  	PCLMULQDQ $0x00, T0, T1;\
   549  	PXOR T1, ACCM
   550  
   551  	MOVQ productTable+0(FP), pTbl
   552  	MOVQ dst+8(FP), ctx
   553  	MOVQ src_base+32(FP), ptx
   554  	MOVQ src_len+40(FP), ptxLen
   555  	MOVQ ctr+56(FP), ctrPtr
   556  	MOVQ T+64(FP), tPtr
   557  	MOVQ ks_base+72(FP), ks
   558  	MOVQ ks_len+80(FP), NR
   559  
        	// NR = len(ks)/4 - 1
   560  	SHRQ $2, NR
   561  	DECQ NR
   562  
   563  	MOVOU bswapMask<>(SB), BSWAP
   564  	MOVOU gcmPoly<>(SB), POLY
   565  
        	// ACC0 = incoming hash state. aluCTR / aluK = byte-swapped last
        	// dword of the counter block and of round key 0.
   566  	MOVOU (tPtr), ACC0
   567  	PXOR ACC1, ACC1
   568  	PXOR ACCM, ACCM
   569  	MOVOU (ctrPtr), B0
   570  	MOVL (3*4)(ctrPtr), aluCTR
   571  	MOVOU (ks), T0
   572  	MOVL (3*4)(ks), aluK
   573  	BSWAPL aluCTR
   574  	BSWAPL aluK
   575  
        	// T0 = counter block XOR round key 0. Slot 0 gets it and is then
        	// patched to counter+1 by increment(0).
   576  	PXOR B0, T0
   577  	MOVOU T0, (8*16 + 0*16)(SP)
   578  	increment(0)
   579  
   580  	CMPQ ptxLen, $128
   581  	JB gcmAesEncSingles
   582  	SUBQ $128, ptxLen
   583  
   584  	// We have at least 8 blocks to encrypt, prepare the rest of the counters
   585  	MOVOU T0, (8*16 + 1*16)(SP)
   586  	increment(1)
   587  	MOVOU T0, (8*16 + 2*16)(SP)
   588  	increment(2)
   589  	MOVOU T0, (8*16 + 3*16)(SP)
   590  	increment(3)
   591  	MOVOU T0, (8*16 + 4*16)(SP)
   592  	increment(4)
   593  	MOVOU T0, (8*16 + 5*16)(SP)
   594  	increment(5)
   595  	MOVOU T0, (8*16 + 6*16)(SP)
   596  	increment(6)
   597  	MOVOU T0, (8*16 + 7*16)(SP)
   598  	increment(7)
   599  
   600  	MOVOU (8*16 + 0*16)(SP), B0
   601  	MOVOU (8*16 + 1*16)(SP), B1
   602  	MOVOU (8*16 + 2*16)(SP), B2
   603  	MOVOU (8*16 + 3*16)(SP), B3
   604  	MOVOU (8*16 + 4*16)(SP), B4
   605  	MOVOU (8*16 + 5*16)(SP), B5
   606  	MOVOU (8*16 + 6*16)(SP), B6
   607  	MOVOU (8*16 + 7*16)(SP), B7
   608  
        	// First 8 blocks: AES only (there is no ciphertext to hash yet).
        	// The increment(i) calls between rounds refill the counter slots
        	// for the NEXT group of 8; B0-B7 were already loaded above.
   609  	aesRound(1)
   610  	increment(0)
   611  	aesRound(2)
   612  	increment(1)
   613  	aesRound(3)
   614  	increment(2)
   615  	aesRound(4)
   616  	increment(3)
   617  	aesRound(5)
   618  	increment(4)
   619  	aesRound(6)
   620  	increment(5)
   621  	aesRound(7)
   622  	increment(6)
   623  	aesRound(8)
   624  	increment(7)
   625  	aesRound(9)
   626  	MOVOU (16*10)(ks), T0
        	// Flags from this compare are consumed by JB and by the later JE;
        	// the AES/MOVOU instructions in between leave them unchanged.
   627  	CMPQ NR, $12
   628  	JB encLast1
   629  	aesRnd(T0)
   630  	aesRound(11)
   631  	MOVOU (16*12)(ks), T0
   632  	JE encLast1
   633  	aesRnd(T0)
   634  	aesRound(13)
   635  	MOVOU (16*14)(ks), T0
   636  encLast1:
   637  	aesRndLast(T0)
   638  
        	// Keystream XOR plaintext.
   639  	MOVOU (16*0)(ptx), T0
   640  	PXOR T0, B0
   641  	MOVOU (16*1)(ptx), T0
   642  	PXOR T0, B1
   643  	MOVOU (16*2)(ptx), T0
   644  	PXOR T0, B2
   645  	MOVOU (16*3)(ptx), T0
   646  	PXOR T0, B3
   647  	MOVOU (16*4)(ptx), T0
   648  	PXOR T0, B4
   649  	MOVOU (16*5)(ptx), T0
   650  	PXOR T0, B5
   651  	MOVOU (16*6)(ptx), T0
   652  	PXOR T0, B6
   653  	MOVOU (16*7)(ptx), T0
   654  	PXOR T0, B7
   655  
        	// Store the ciphertext, then byte-reverse each block for hashing.
        	// The running hash state is folded into block 0 only.
   656  	MOVOU B0, (16*0)(ctx)
   657  	PSHUFB BSWAP, B0
   658  	PXOR ACC0, B0
   659  	MOVOU B1, (16*1)(ctx)
   660  	PSHUFB BSWAP, B1
   661  	MOVOU B2, (16*2)(ctx)
   662  	PSHUFB BSWAP, B2
   663  	MOVOU B3, (16*3)(ctx)
   664  	PSHUFB BSWAP, B3
   665  	MOVOU B4, (16*4)(ctx)
   666  	PSHUFB BSWAP, B4
   667  	MOVOU B5, (16*5)(ctx)
   668  	PSHUFB BSWAP, B5
   669  	MOVOU B6, (16*6)(ctx)
   670  	PSHUFB BSWAP, B6
   671  	MOVOU B7, (16*7)(ctx)
   672  	PSHUFB BSWAP, B7
   673  
        	// Park the 8 blocks in the low half of the frame until they are hashed.
   674  	MOVOU B0, (16*0)(SP)
   675  	MOVOU B1, (16*1)(SP)
   676  	MOVOU B2, (16*2)(SP)
   677  	MOVOU B3, (16*3)(SP)
   678  	MOVOU B4, (16*4)(SP)
   679  	MOVOU B5, (16*5)(SP)
   680  	MOVOU B6, (16*6)(SP)
   681  	MOVOU B7, (16*7)(SP)
   682  
   683  	LEAQ 128(ptx), ptx
   684  	LEAQ 128(ctx), ctx
   685  
        // Steady state: encrypt the next 8 blocks while hashing the previous 8.
   686  gcmAesEncOctetsLoop:
   687  
   688  		CMPQ ptxLen, $128
   689  		JB gcmAesEncOctetsEnd
   690  		SUBQ $128, ptxLen
   691  
   692  		MOVOU (8*16 + 0*16)(SP), B0
   693  		MOVOU (8*16 + 1*16)(SP), B1
   694  		MOVOU (8*16 + 2*16)(SP), B2
   695  		MOVOU (8*16 + 3*16)(SP), B3
   696  		MOVOU (8*16 + 4*16)(SP), B4
   697  		MOVOU (8*16 + 5*16)(SP), B5
   698  		MOVOU (8*16 + 6*16)(SP), B6
   699  		MOVOU (8*16 + 7*16)(SP), B7
   700  
        		// Saved block 0 times table pair 0 initialises the accumulators.
   701  		MOVOU (16*0)(SP), T0
   702  		PSHUFD $78, T0, T1
   703  		PXOR T0, T1
   704  
   705  		MOVOU (16*0)(pTbl), ACC0
   706  		MOVOU (16*1)(pTbl), ACCM
   707  		MOVOU ACC0, ACC1
   708  
   709  		PCLMULQDQ $0x00, T1, ACCM
   710  		PCLMULQDQ $0x00, T0, ACC0
   711  		PCLMULQDQ $0x11, T0, ACC1
   712  
   713  		combinedRound(1)
   714  		increment(0)
   715  		combinedRound(2)
   716  		increment(1)
   717  		combinedRound(3)
   718  		increment(2)
   719  		combinedRound(4)
   720  		increment(3)
   721  		combinedRound(5)
   722  		increment(4)
   723  		combinedRound(6)
   724  		increment(5)
   725  		combinedRound(7)
   726  		increment(6)
   727  
   728  		aesRound(8)
   729  		increment(7)
   730  
        		// Combine the accumulators and reduce, interleaved with round 9.
   731  		PXOR ACC0, ACCM
   732  		PXOR ACC1, ACCM
   733  		MOVOU ACCM, T0
   734  		PSRLDQ $8, ACCM
   735  		PSLLDQ $8, T0
   736  		PXOR ACCM, ACC1
   737  		PXOR T0, ACC0
   738  
   739  		reduceRound(ACC0)
   740  		aesRound(9)
   741  
   742  		reduceRound(ACC0)
   743  		PXOR ACC1, ACC0
   744  
   745  		MOVOU (16*10)(ks), T0
   746  		CMPQ NR, $12
   747  		JB encLast2
   748  		aesRnd(T0)
   749  		aesRound(11)
   750  		MOVOU (16*12)(ks), T0
   751  		JE encLast2
   752  		aesRnd(T0)
   753  		aesRound(13)
   754  		MOVOU (16*14)(ks), T0
   755  encLast2:
   756  		aesRndLast(T0)
   757  
   758  		MOVOU (16*0)(ptx), T0
   759  		PXOR T0, B0
   760  		MOVOU (16*1)(ptx), T0
   761  		PXOR T0, B1
   762  		MOVOU (16*2)(ptx), T0
   763  		PXOR T0, B2
   764  		MOVOU (16*3)(ptx), T0
   765  		PXOR T0, B3
   766  		MOVOU (16*4)(ptx), T0
   767  		PXOR T0, B4
   768  		MOVOU (16*5)(ptx), T0
   769  		PXOR T0, B5
   770  		MOVOU (16*6)(ptx), T0
   771  		PXOR T0, B6
   772  		MOVOU (16*7)(ptx), T0
   773  		PXOR T0, B7
   774  
   775  		MOVOU B0, (16*0)(ctx)
   776  		PSHUFB BSWAP, B0
   777  		PXOR ACC0, B0
   778  		MOVOU B1, (16*1)(ctx)
   779  		PSHUFB BSWAP, B1
   780  		MOVOU B2, (16*2)(ctx)
   781  		PSHUFB BSWAP, B2
   782  		MOVOU B3, (16*3)(ctx)
   783  		PSHUFB BSWAP, B3
   784  		MOVOU B4, (16*4)(ctx)
   785  		PSHUFB BSWAP, B4
   786  		MOVOU B5, (16*5)(ctx)
   787  		PSHUFB BSWAP, B5
   788  		MOVOU B6, (16*6)(ctx)
   789  		PSHUFB BSWAP, B6
   790  		MOVOU B7, (16*7)(ctx)
   791  		PSHUFB BSWAP, B7
   792  
   793  		MOVOU B0, (16*0)(SP)
   794  		MOVOU B1, (16*1)(SP)
   795  		MOVOU B2, (16*2)(SP)
   796  		MOVOU B3, (16*3)(SP)
   797  		MOVOU B4, (16*4)(SP)
   798  		MOVOU B5, (16*5)(SP)
   799  		MOVOU B6, (16*6)(SP)
   800  		MOVOU B7, (16*7)(SP)
   801  
   802  		LEAQ 128(ptx), ptx
   803  		LEAQ 128(ctx), ctx
   804  
   805  		JMP gcmAesEncOctetsLoop
   806  
        // Fewer than 128 bytes remain: hash the 8 ciphertext blocks still
        // parked on the stack.
   807  gcmAesEncOctetsEnd:
   808  
   809  	MOVOU (16*0)(SP), T0
   810  	MOVOU (16*0)(pTbl), ACC0
   811  	MOVOU (16*1)(pTbl), ACCM
   812  	MOVOU ACC0, ACC1
   813  	PSHUFD $78, T0, T1
   814  	PXOR T0, T1
   815  	PCLMULQDQ $0x00, T0, ACC0
   816  	PCLMULQDQ $0x11, T0, ACC1
   817  	PCLMULQDQ $0x00, T1, ACCM
   818  
   819  	mulRound(1)
   820  	mulRound(2)
   821  	mulRound(3)
   822  	mulRound(4)
   823  	mulRound(5)
   824  	mulRound(6)
   825  	mulRound(7)
   826  
   827  	PXOR ACC0, ACCM
   828  	PXOR ACC1, ACCM
   829  	MOVOU ACCM, T0
   830  	PSRLDQ $8, ACCM
   831  	PSLLDQ $8, T0
   832  	PXOR ACCM, ACC1
   833  	PXOR T0, ACC0
   834  
   835  	reduceRound(ACC0)
   836  	reduceRound(ACC0)
   837  	PXOR ACC1, ACC0
   838  
   839  	TESTQ ptxLen, ptxLen
   840  	JE gcmAesEncDone
   841  
        	// Eight counters were prepared ahead but only slot 0 will be used
        	// next; rewind aluCTR by 7 so that it matches slot 0 again.
   842  	SUBQ $7, aluCTR
   843  
   844  gcmAesEncSingles:
   845  
        	// B1-B7 now cache round keys 1-7, T2 the table entry at 16*14.
   846  	MOVOU (16*1)(ks), B1
   847  	MOVOU (16*2)(ks), B2
   848  	MOVOU (16*3)(ks), B3
   849  	MOVOU (16*4)(ks), B4
   850  	MOVOU (16*5)(ks), B5
   851  	MOVOU (16*6)(ks), B6
   852  	MOVOU (16*7)(ks), B7
   853  
   854  	MOVOU (16*14)(pTbl), T2
   855  
   856  gcmAesEncSinglesLoop:
   857  
   858  		CMPQ ptxLen, $16
   859  		JB gcmAesEncTail
   860  		SUBQ $16, ptxLen
   861  
        		// Take the counter from slot 0, then advance slot 0 for the
        		// next block.
   862  		MOVOU (8*16 + 0*16)(SP), B0
   863  		increment(0)
   864  
   865  		AESENC B1, B0
   866  		AESENC B2, B0
   867  		AESENC B3, B0
   868  		AESENC B4, B0
   869  		AESENC B5, B0
   870  		AESENC B6, B0
   871  		AESENC B7, B0
   872  		MOVOU (16*8)(ks), T0
   873  		AESENC T0, B0
   874  		MOVOU (16*9)(ks), T0
   875  		AESENC T0, B0
   876  		MOVOU (16*10)(ks), T0
   877  		CMPQ NR, $12
   878  		JB encLast3
   879  		AESENC T0, B0
   880  		MOVOU (16*11)(ks), T0
   881  		AESENC T0, B0
   882  		MOVOU (16*12)(ks), T0
   883  		JE encLast3
   884  		AESENC T0, B0
   885  		MOVOU (16*13)(ks), T0
   886  		AESENC T0, B0
   887  		MOVOU (16*14)(ks), T0
   888  encLast3:
   889  		AESENCLAST T0, B0
   890  
   891  		MOVOU (ptx), T0
   892  		PXOR T0, B0
   893  		MOVOU B0, (ctx)
   894  
        		// Hash the ciphertext block just written.
   895  		PSHUFB BSWAP, B0
   896  		PXOR ACC0, B0
   897  
   898  		MOVOU T2, ACC0
   899  		MOVOU T2, ACC1
   900  		MOVOU (16*15)(pTbl), ACCM
   901  
   902  		PSHUFD $78, B0, T0
   903  		PXOR B0, T0
   904  		PCLMULQDQ $0x00, B0, ACC0
   905  		PCLMULQDQ $0x11, B0, ACC1
   906  		PCLMULQDQ $0x00, T0, ACCM
   907  
   908  		PXOR ACC0, ACCM
   909  		PXOR ACC1, ACCM
   910  		MOVOU ACCM, T0
   911  		PSRLDQ $8, ACCM
   912  		PSLLDQ $8, T0
   913  		PXOR ACCM, ACC1
   914  		PXOR T0, ACC0
   915  
   916  		reduceRound(ACC0)
   917  		reduceRound(ACC0)
   918  		PXOR ACC1, ACC0
   919  
   920  		LEAQ (16*1)(ptx), ptx
   921  		LEAQ (16*1)(ctx), ctx
   922  
   923  	JMP gcmAesEncSinglesLoop
   924  
        // 0..15 bytes left.
   925  gcmAesEncTail:
   926  	TESTQ ptxLen, ptxLen
   927  	JE gcmAesEncDone
   928  
   929  	MOVOU (8*16 + 0*16)(SP), B0
   930  	AESENC B1, B0
   931  	AESENC B2, B0
   932  	AESENC B3, B0
   933  	AESENC B4, B0
   934  	AESENC B5, B0
   935  	AESENC B6, B0
   936  	AESENC B7, B0
   937  	MOVOU (16*8)(ks), T0
   938  	AESENC T0, B0
   939  	MOVOU (16*9)(ks), T0
   940  	AESENC T0, B0
   941  	MOVOU (16*10)(ks), T0
   942  	CMPQ NR, $12
   943  	JB encLast4
   944  	AESENC T0, B0
   945  	MOVOU (16*11)(ks), T0
   946  	AESENC T0, B0
   947  	MOVOU (16*12)(ks), T0
   948  	JE encLast4
   949  	AESENC T0, B0
   950  	MOVOU (16*13)(ks), T0
   951  	AESENC T0, B0
   952  	MOVOU (16*14)(ks), T0
   953  encLast4:
   954  	AESENCLAST T0, B0
        	// T0 = keystream block; B0 is reused to assemble the plaintext tail.
   955  	MOVOU B0, T0
   956  
        	// ptx -> last remaining plaintext byte.
   957  	LEAQ -1(ptx)(ptxLen*1), ptx
   958  
        	// T1 = andMask entry for ptxLen bytes (offset 16*(ptxLen-1)).
        	// aluCTR is no longer needed as a counter and is reused as the
        	// table base pointer.
   959  	MOVQ ptxLen, aluTMP
   960  	SHLQ $4, aluTMP
   961  
   962  	LEAQ andMask<>(SB), aluCTR
   963  	MOVOU -16(aluCTR)(aluTMP*1), T1
   964  
        	// Load the tail byte by byte from the end, so reads never go past
        	// the end of src; unused upper bytes of B0 stay zero.
   965  	PXOR B0, B0
   966  ptxLoadLoop:
   967  		PSLLDQ $1, B0
   968  		PINSRB $0, (ptx), B0
   969  		LEAQ -1(ptx), ptx
   970  		DECQ ptxLen
   971  	JNE ptxLoadLoop
   972  
        	// XOR with the keystream and clear the bytes beyond the tail.
   973  	PXOR T0, B0
   974  	PAND T1, B0
   975  	MOVOU B0, (ctx)	// I assume there is always space, due to TAG in the end of the CT
   976  
        	// Hash the zero-padded final ciphertext block.
   977  	PSHUFB BSWAP, B0
   978  	PXOR ACC0, B0
   979  
   980  	MOVOU T2, ACC0
   981  	MOVOU T2, ACC1
   982  	MOVOU (16*15)(pTbl), ACCM
   983  
   984  	PSHUFD $78, B0, T0
   985  	PXOR B0, T0
   986  	PCLMULQDQ $0x00, B0, ACC0
   987  	PCLMULQDQ $0x11, B0, ACC1
   988  	PCLMULQDQ $0x00, T0, ACCM
   989  
   990  	PXOR ACC0, ACCM
   991  	PXOR ACC1, ACCM
   992  	MOVOU ACCM, T0
   993  	PSRLDQ $8, ACCM
   994  	PSLLDQ $8, T0
   995  	PXOR ACCM, ACC1
   996  	PXOR T0, ACC0
   997  
   998  	reduceRound(ACC0)
   999  	reduceRound(ACC0)
  1000  	PXOR ACC1, ACC0
  1001  
  1002  gcmAesEncDone:
        	// Write the updated hash state back to T.
  1003  	MOVOU ACC0, (tPtr)
  1004  	RET
  1005  #undef increment
  1006  
  1007  // func gcmAesDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
        //
        // Decrypts len(src) bytes from src into dst in counter mode and folds
        // the ciphertext (src) into the hash state at T (read on entry, written
        // on exit). Here ctx aliases src and ptx aliases dst. As in gcmAesEnc,
        // the first keystream block comes from the counter at ctr plus one.
        //
        // Stack frame (128 bytes): 8 counter blocks at (0*16 .. 7*16)(SP),
        // already XORed with round key 0. No ciphertext staging area is needed
        // because the hash input is read straight from ctx.
        //
        // Uses the register aliases and the aesRnd/aesRound/aesRndLast and
        // reduceRound macros defined earlier in the file.
  1008  TEXT ·gcmAesDec(SB),0,$128-96
        // increment(i): same as in gcmAesEnc, but the counter slots start at
        // (0)(SP) in this frame.
  1009  #define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + i*16)(SP)
        // combinedDecRound(i): AES round i on all eight blocks, interleaved
        // with the multiply-accumulate of ciphertext block i (loaded from ctx
        // and byte-reversed) by table pair i. Clobbers T0, T1, T2.
  1010  #define combinedDecRound(i) \
  1011  	MOVOU (16*i)(ks), T0;\
  1012  	AESENC T0, B0;\
  1013  	AESENC T0, B1;\
  1014  	AESENC T0, B2;\
  1015  	AESENC T0, B3;\
  1016  	MOVOU (16*(i*2))(pTbl), T1;\
  1017  	MOVOU T1, T2;\
  1018  	AESENC T0, B4;\
  1019  	AESENC T0, B5;\
  1020  	AESENC T0, B6;\
  1021  	AESENC T0, B7;\
  1022  	MOVOU (16*i)(ctx), T0;\
  1023  	PSHUFB BSWAP, T0;\
  1024  	PCLMULQDQ $0x00, T0, T1;\
  1025  	PXOR T1, ACC0;\
  1026  	PSHUFD $78, T0, T1;\
  1027  	PCLMULQDQ $0x11, T0, T2;\
  1028  	PXOR T1, T0;\
  1029  	PXOR T2, ACC1;\
  1030  	MOVOU (16*(i*2+1))(pTbl), T2;\
  1031  	PCLMULQDQ $0x00, T2, T0;\
  1032  	PXOR T0, ACCM
  1033  
  1034  	MOVQ productTable+0(FP), pTbl
  1035  	MOVQ dst+8(FP), ptx
  1036  	MOVQ src_base+32(FP), ctx
  1037  	MOVQ src_len+40(FP), ptxLen
  1038  	MOVQ ctr+56(FP), ctrPtr
  1039  	MOVQ T+64(FP), tPtr
  1040  	MOVQ ks_base+72(FP), ks
  1041  	MOVQ ks_len+80(FP), NR
  1042  
        	// NR = len(ks)/4 - 1
  1043  	SHRQ $2, NR
  1044  	DECQ NR
  1045  
  1046  	MOVOU bswapMask<>(SB), BSWAP
  1047  	MOVOU gcmPoly<>(SB), POLY
  1048  
        	// ACC0 = incoming hash state. aluCTR / aluK = byte-swapped last
        	// dword of the counter block and of round key 0.
  1049  	MOVOU (tPtr), ACC0
  1050  	PXOR ACC1, ACC1
  1051  	PXOR ACCM, ACCM
  1052  	MOVOU (ctrPtr), B0
  1053  	MOVL (3*4)(ctrPtr), aluCTR
  1054  	MOVOU (ks), T0
  1055  	MOVL (3*4)(ks), aluK
  1056  	BSWAPL aluCTR
  1057  	BSWAPL aluK
  1058  
        	// T0 = counter block XOR round key 0; slot 0 is then patched to
        	// counter+1 by increment(0).
  1059  	PXOR B0, T0
  1060  	MOVOU T0, (0*16)(SP)
  1061  	increment(0)
  1062  
  1063  	CMPQ ptxLen, $128
  1064  	JB gcmAesDecSingles
  1065  
        	// At least 8 blocks: prepare the remaining seven counter slots.
  1066  	MOVOU T0, (1*16)(SP)
  1067  	increment(1)
  1068  	MOVOU T0, (2*16)(SP)
  1069  	increment(2)
  1070  	MOVOU T0, (3*16)(SP)
  1071  	increment(3)
  1072  	MOVOU T0, (4*16)(SP)
  1073  	increment(4)
  1074  	MOVOU T0, (5*16)(SP)
  1075  	increment(5)
  1076  	MOVOU T0, (6*16)(SP)
  1077  	increment(6)
  1078  	MOVOU T0, (7*16)(SP)
  1079  	increment(7)
  1080  
        // Each iteration hashes and decrypts the same 8 ciphertext blocks.
  1081  gcmAesDecOctetsLoop:
  1082  
  1083  		CMPQ ptxLen, $128
  1084  		JB gcmAesDecEndOctets
  1085  		SUBQ $128, ptxLen
  1086  
  1087  		MOVOU (0*16)(SP), B0
  1088  		MOVOU (1*16)(SP), B1
  1089  		MOVOU (2*16)(SP), B2
  1090  		MOVOU (3*16)(SP), B3
  1091  		MOVOU (4*16)(SP), B4
  1092  		MOVOU (5*16)(SP), B5
  1093  		MOVOU (6*16)(SP), B6
  1094  		MOVOU (7*16)(SP), B7
  1095  
        		// Ciphertext block 0, with the running state folded in, times
        		// table pair 0 initialises the accumulators.
  1096  		MOVOU (16*0)(ctx), T0
  1097  		PSHUFB BSWAP, T0
  1098  		PXOR ACC0, T0
  1099  		PSHUFD $78, T0, T1
  1100  		PXOR T0, T1
  1101  
  1102  		MOVOU (16*0)(pTbl), ACC0
  1103  		MOVOU (16*1)(pTbl), ACCM
  1104  		MOVOU ACC0, ACC1
  1105  
  1106  		PCLMULQDQ $0x00, T1, ACCM
  1107  		PCLMULQDQ $0x00, T0, ACC0
  1108  		PCLMULQDQ $0x11, T0, ACC1
  1109  
        		// The increment(i) calls refill the counter slots for the
        		// next group of 8; B0-B7 were already loaded above.
  1110  		combinedDecRound(1)
  1111  		increment(0)
  1112  		combinedDecRound(2)
  1113  		increment(1)
  1114  		combinedDecRound(3)
  1115  		increment(2)
  1116  		combinedDecRound(4)
  1117  		increment(3)
  1118  		combinedDecRound(5)
  1119  		increment(4)
  1120  		combinedDecRound(6)
  1121  		increment(5)
  1122  		combinedDecRound(7)
  1123  		increment(6)
  1124  
  1125  		aesRound(8)
  1126  		increment(7)
  1127  
  1128  		PXOR ACC0, ACCM
  1129  		PXOR ACC1, ACCM
  1130  		MOVOU ACCM, T0
  1131  		PSRLDQ $8, ACCM
  1132  		PSLLDQ $8, T0
  1133  		PXOR ACCM, ACC1
  1134  		PXOR T0, ACC0
  1135  
  1136  		reduceRound(ACC0)
  1137  		aesRound(9)
  1138  
  1139  		reduceRound(ACC0)
  1140  		PXOR ACC1, ACC0
  1141  
  1142  		MOVOU (16*10)(ks), T0
        		// Flags from this compare are consumed by JB and by the later
        		// JE; the AES/MOVOU instructions in between leave them unchanged.
  1143  		CMPQ NR, $12
  1144  		JB decLast1
  1145  		aesRnd(T0)
  1146  		aesRound(11)
  1147  		MOVOU (16*12)(ks), T0
  1148  		JE decLast1
  1149  		aesRnd(T0)
  1150  		aesRound(13)
  1151  		MOVOU (16*14)(ks), T0
  1152  decLast1:
  1153  		aesRndLast(T0)
  1154  
        		// Keystream XOR ciphertext.
  1155  		MOVOU (16*0)(ctx), T0
  1156  		PXOR T0, B0
  1157  		MOVOU (16*1)(ctx), T0
  1158  		PXOR T0, B1
  1159  		MOVOU (16*2)(ctx), T0
  1160  		PXOR T0, B2
  1161  		MOVOU (16*3)(ctx), T0
  1162  		PXOR T0, B3
  1163  		MOVOU (16*4)(ctx), T0
  1164  		PXOR T0, B4
  1165  		MOVOU (16*5)(ctx), T0
  1166  		PXOR T0, B5
  1167  		MOVOU (16*6)(ctx), T0
  1168  		PXOR T0, B6
  1169  		MOVOU (16*7)(ctx), T0
  1170  		PXOR T0, B7
  1171  
  1172  		MOVOU B0, (16*0)(ptx)
  1173  		MOVOU B1, (16*1)(ptx)
  1174  		MOVOU B2, (16*2)(ptx)
  1175  		MOVOU B3, (16*3)(ptx)
  1176  		MOVOU B4, (16*4)(ptx)
  1177  		MOVOU B5, (16*5)(ptx)
  1178  		MOVOU B6, (16*6)(ptx)
  1179  		MOVOU B7, (16*7)(ptx)
  1180  
  1181  		LEAQ 128(ptx), ptx
  1182  		LEAQ 128(ctx), ctx
  1183  
  1184  		JMP gcmAesDecOctetsLoop
  1185  
  1186  gcmAesDecEndOctets:
  1187  
        	// Eight counters were prepared ahead but only slot 0 will be used
        	// next; rewind aluCTR by 7 so that it matches slot 0 again.
  1188  	SUBQ $7, aluCTR
  1189  
  1190  gcmAesDecSingles:
  1191  
        	// B1-B7 now cache round keys 1-7, T2 the table entry at 16*14.
  1192  	MOVOU (16*1)(ks), B1
  1193  	MOVOU (16*2)(ks), B2
  1194  	MOVOU (16*3)(ks), B3
  1195  	MOVOU (16*4)(ks), B4
  1196  	MOVOU (16*5)(ks), B5
  1197  	MOVOU (16*6)(ks), B6
  1198  	MOVOU (16*7)(ks), B7
  1199  
  1200  	MOVOU (16*14)(pTbl), T2
  1201  
  1202  gcmAesDecSinglesLoop:
  1203  
  1204  		CMPQ ptxLen, $16
  1205  		JB gcmAesDecTail
  1206  		SUBQ $16, ptxLen
  1207  
        		// T1 keeps the raw ciphertext for the final XOR; B0 is the
        		// byte-reversed copy that gets hashed first.
  1208  		MOVOU (ctx), B0
  1209  		MOVOU B0, T1
  1210  		PSHUFB BSWAP, B0
  1211  		PXOR ACC0, B0
  1212  
  1213  		MOVOU T2, ACC0
  1214  		MOVOU T2, ACC1
  1215  		MOVOU (16*15)(pTbl), ACCM
  1216  
  1217  		PCLMULQDQ $0x00, B0, ACC0
  1218  		PCLMULQDQ $0x11, B0, ACC1
  1219  		PSHUFD $78, B0, T0
  1220  		PXOR B0, T0
  1221  		PCLMULQDQ $0x00, T0, ACCM
  1222  
  1223  		PXOR ACC0, ACCM
  1224  		PXOR ACC1, ACCM
  1225  		MOVOU ACCM, T0
  1226  		PSRLDQ $8, ACCM
  1227  		PSLLDQ $8, T0
  1228  		PXOR ACCM, ACC1
  1229  		PXOR T0, ACC0
  1230  
  1231  		reduceRound(ACC0)
  1232  		reduceRound(ACC0)
  1233  		PXOR ACC1, ACC0
  1234  
        		// Generate the keystream block from slot 0 and advance slot 0.
  1235  		MOVOU (0*16)(SP), B0
  1236  		increment(0)
  1237  		AESENC B1, B0
  1238  		AESENC B2, B0
  1239  		AESENC B3, B0
  1240  		AESENC B4, B0
  1241  		AESENC B5, B0
  1242  		AESENC B6, B0
  1243  		AESENC B7, B0
  1244  		MOVOU (16*8)(ks), T0
  1245  		AESENC T0, B0
  1246  		MOVOU (16*9)(ks), T0
  1247  		AESENC T0, B0
  1248  		MOVOU (16*10)(ks), T0
  1249  		CMPQ NR, $12
  1250  		JB decLast2
  1251  		AESENC T0, B0
  1252  		MOVOU (16*11)(ks), T0
  1253  		AESENC T0, B0
  1254  		MOVOU (16*12)(ks), T0
  1255  		JE decLast2
  1256  		AESENC T0, B0
  1257  		MOVOU (16*13)(ks), T0
  1258  		AESENC T0, B0
  1259  		MOVOU (16*14)(ks), T0
  1260  decLast2:
  1261  		AESENCLAST T0, B0
  1262  
  1263  		PXOR T1, B0
  1264  		MOVOU B0, (ptx)
  1265  
  1266  		LEAQ (16*1)(ptx), ptx
  1267  		LEAQ (16*1)(ctx), ctx
  1268  
  1269  	JMP gcmAesDecSinglesLoop
  1270  
        // 0..15 bytes left.
  1271  gcmAesDecTail:
  1272  
  1273  	TESTQ ptxLen, ptxLen
  1274  	JE gcmAesDecDone
  1275  
        	// T1 = andMask entry for ptxLen bytes (offset 16*(ptxLen-1)).
        	// aluCTR is reused as the table base pointer; the increment(0)
        	// further down therefore stores a meaningless value into slot 0,
        	// which is never read again.
  1276  	MOVQ ptxLen, aluTMP
  1277  	SHLQ $4, aluTMP
  1278  	LEAQ andMask<>(SB), aluCTR
  1279  	MOVOU -16(aluCTR)(aluTMP*1), T1
  1280  
  1281  	MOVOU (ctx), B0	// I assume there is TAG attached to the ctx, and there is no read overflow
        	// Keep only the ptxLen valid ciphertext bytes.
  1282  	PAND T1, B0
  1283  
        	// T1 now holds the masked ciphertext for the final XOR.
  1284  	MOVOU B0, T1
  1285  	PSHUFB BSWAP, B0
  1286  	PXOR ACC0, B0
  1287  
  1288  	MOVOU (16*14)(pTbl), ACC0
  1289  	MOVOU (16*15)(pTbl), ACCM
  1290  	MOVOU ACC0, ACC1
  1291  
  1292  	PCLMULQDQ $0x00, B0, ACC0
  1293  	PCLMULQDQ $0x11, B0, ACC1
  1294  	PSHUFD $78, B0, T0
  1295  	PXOR B0, T0
  1296  	PCLMULQDQ $0x00, T0, ACCM
  1297  
  1298  	PXOR ACC0, ACCM
  1299  	PXOR ACC1, ACCM
  1300  	MOVOU ACCM, T0
  1301  	PSRLDQ $8, ACCM
  1302  	PSLLDQ $8, T0
  1303  	PXOR ACCM, ACC1
  1304  	PXOR T0, ACC0
  1305  
  1306  	reduceRound(ACC0)
  1307  	reduceRound(ACC0)
  1308  	PXOR ACC1, ACC0
  1309  
  1310  	MOVOU (0*16)(SP), B0
  1311  	increment(0)
  1312  	AESENC B1, B0
  1313  	AESENC B2, B0
  1314  	AESENC B3, B0
  1315  	AESENC B4, B0
  1316  	AESENC B5, B0
  1317  	AESENC B6, B0
  1318  	AESENC B7, B0
  1319  	MOVOU (16*8)(ks), T0
  1320  	AESENC T0, B0
  1321  	MOVOU (16*9)(ks), T0
  1322  	AESENC T0, B0
  1323  	MOVOU (16*10)(ks), T0
  1324  	CMPQ NR, $12
  1325  	JB decLast3
  1326  	AESENC T0, B0
  1327  	MOVOU (16*11)(ks), T0
  1328  	AESENC T0, B0
  1329  	MOVOU (16*12)(ks), T0
  1330  	JE decLast3
  1331  	AESENC T0, B0
  1332  	MOVOU (16*13)(ks), T0
  1333  	AESENC T0, B0
  1334  	MOVOU (16*14)(ks), T0
  1335  decLast3:
  1336  	AESENCLAST T0, B0
  1337  	PXOR T1, B0
  1338  
        	// Write exactly ptxLen plaintext bytes, lowest byte first, so dst
        	// is never written past its end.
  1339  ptxStoreLoop:
  1340  		PEXTRB $0, B0, (ptx)
  1341  		PSRLDQ $1, B0
  1342  		LEAQ 1(ptx), ptx
  1343  		DECQ ptxLen
  1344  
  1345  	JNE ptxStoreLoop
  1346  
  1347  gcmAesDecDone:
  1348  
        	// Write the updated hash state back to T.
  1349  	MOVOU ACC0, (tPtr)
  1350  	RET