github.com/epfl-dcsl/gotee@v0.0.0-20200909122901-014b35f5e5e9/src/crypto/aes/gcm_amd64.s (about)

     1  // Copyright 2015 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // This is an optimized implementation of AES-GCM using AES-NI and CLMUL-NI.
     6  // The implementation uses some of the optimizations described in:
     7  // [1] Gueron, S., Kounavis, M.E.: Intel® Carry-Less Multiplication
     8  //     Instruction and its Usage for Computing the GCM Mode rev. 2.02
     9  // [2] Gueron, S., Krasnov, V.: Speeding up Counter Mode in Software and
    10  //     Hardware
    11  
    12  #include "textflag.h"
    13  // Register aliases used by the GCM routines in this file.
        // B0-B7: 16-byte working blocks (the single-block loops also park round keys 1-7 in B1-B7).
    14  #define B0 X0
    15  #define B1 X1
    16  #define B2 X2
    17  #define B3 X3
    18  #define B4 X4
    19  #define B5 X5
    20  #define B6 X6
    21  #define B7 X7
    22  // ACC0 / ACC1 / ACCM: accumulators for the low, high and middle (Karatsuba) carry-less products.
    23  #define ACC0 X8
    24  #define ACC1 X9
    25  #define ACCM X10
    26  // T0-T2: scratch. POLY and BSWAP hold gcmPoly<> and bswapMask<>; each routine that needs them loads them itself.
    27  #define T0 X11
    28  #define T1 X12
    29  #define T2 X13
    30  #define POLY X14
    31  #define BSWAP X15
    32  // bswapMask: PSHUFB control bytes 15,14,...,0, i.e. a full 16-byte byte-order reversal.
    33  DATA bswapMask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f
    34  DATA bswapMask<>+0x08(SB)/8, $0x0001020304050607
    35  // gcmPoly: constant loaded into POLY and consumed by the reduction steps (PCLMULQDQ against POLY).
    36  DATA gcmPoly<>+0x00(SB)/8, $0x0000000000000001
    37  DATA gcmPoly<>+0x08(SB)/8, $0xc200000000000000
    38  // andMask: 15 entries of 16 bytes; entry k (0-based, at offset 16*k) has its low k+1 bytes set to 0xff.
        // gcmAesEnc indexes it with -16 + 16*n to keep only the low n bytes (1 <= n <= 15) of a partial block.
    39  DATA andMask<>+0x00(SB)/8, $0x00000000000000ff
    40  DATA andMask<>+0x08(SB)/8, $0x0000000000000000
    41  DATA andMask<>+0x10(SB)/8, $0x000000000000ffff
    42  DATA andMask<>+0x18(SB)/8, $0x0000000000000000
    43  DATA andMask<>+0x20(SB)/8, $0x0000000000ffffff
    44  DATA andMask<>+0x28(SB)/8, $0x0000000000000000
    45  DATA andMask<>+0x30(SB)/8, $0x00000000ffffffff
    46  DATA andMask<>+0x38(SB)/8, $0x0000000000000000
    47  DATA andMask<>+0x40(SB)/8, $0x000000ffffffffff
    48  DATA andMask<>+0x48(SB)/8, $0x0000000000000000
    49  DATA andMask<>+0x50(SB)/8, $0x0000ffffffffffff
    50  DATA andMask<>+0x58(SB)/8, $0x0000000000000000
    51  DATA andMask<>+0x60(SB)/8, $0x00ffffffffffffff
    52  DATA andMask<>+0x68(SB)/8, $0x0000000000000000
    53  DATA andMask<>+0x70(SB)/8, $0xffffffffffffffff
    54  DATA andMask<>+0x78(SB)/8, $0x0000000000000000
    55  DATA andMask<>+0x80(SB)/8, $0xffffffffffffffff
    56  DATA andMask<>+0x88(SB)/8, $0x00000000000000ff
    57  DATA andMask<>+0x90(SB)/8, $0xffffffffffffffff
    58  DATA andMask<>+0x98(SB)/8, $0x000000000000ffff
    59  DATA andMask<>+0xa0(SB)/8, $0xffffffffffffffff
    60  DATA andMask<>+0xa8(SB)/8, $0x0000000000ffffff
    61  DATA andMask<>+0xb0(SB)/8, $0xffffffffffffffff
    62  DATA andMask<>+0xb8(SB)/8, $0x00000000ffffffff
    63  DATA andMask<>+0xc0(SB)/8, $0xffffffffffffffff
    64  DATA andMask<>+0xc8(SB)/8, $0x000000ffffffffff
    65  DATA andMask<>+0xd0(SB)/8, $0xffffffffffffffff
    66  DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
    67  DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff
    68  DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
    69  // All three tables are file-local (<>), read-only and pointer-free; sizes are 16, 16 and 15*16 = 240 bytes.
    70  GLOBL bswapMask<>(SB), (NOPTR+RODATA), $16
    71  GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16
    72  GLOBL andMask<>(SB), (NOPTR+RODATA), $240
    73  
    74  // func hasGCMAsm() bool
    75  // returns whether AES-NI AND CLMUL-NI are supported
        //
        // If the byte runtime·isEnclave equals 1 the function returns true without
        // executing CPUID. Otherwise it executes CPUID with EAX=1 and returns
        // (ECX bit 25) AND (ECX bit 1).
    76  TEXT ·hasGCMAsm(SB),NOSPLIT,$0
    77  	// @aghosn: when running as an enclave, skip CPUID and report support unconditionally
    78  	MOVB runtime·isEnclave(SB), R8
    79  	CMPB R8, $1
    80  	JNE normal
    81  	MOVB $1, CX	// enclave path: result = true
    82  	JMP end
    83  
    84  normal:
    85  	XORQ AX, AX
    86  	INCL AX	// EAX = 1: CPUID feature-information leaf
    87  	CPUID
    88  	MOVQ CX, DX
    89  	SHRQ $25, CX	// bit 0 of CX = ECX bit 25
    90  	SHRQ $1, DX	// bit 0 of DX = ECX bit 1
    91  	ANDQ DX, CX
    92  	ANDQ $1, CX	// keep only the combined flag
    93  end:
    94  	MOVB CX, ret+0(FP)
    95  	RET
    96  
    97  // func aesEncBlock(dst, src *[16]byte, ks []uint32)
        //
        // Encrypts the 16-byte block at src with the expanded key schedule ks and
        // stores the result at dst. The round count is len(ks)/4 - 1: a value below
        // 12 takes the 10-round path, exactly 12 the 12-round path, anything larger
        // the 14-round path. No bounds checks are performed on ks.
    98  TEXT ·aesEncBlock(SB),NOSPLIT,$0
    99  	MOVQ dst+0(FP), DI
   100  	MOVQ src+8(FP), SI
   101  	MOVQ ks_base+16(FP), DX
   102  	MOVQ ks_len+24(FP), CX
   103  	// CX = len(ks)/4 - 1 = number of rounds
   104  	SHRQ $2, CX
   105  	DECQ CX
   106  	// Initial whitening with round key 0, then rounds 1..9 which every key size shares.
   107  	MOVOU (SI), X0
   108  	MOVOU (16*0)(DX), X1
   109  	PXOR X1, X0
   110  	MOVOU (16*1)(DX), X1
   111  	AESENC X1, X0
   112  	MOVOU (16*2)(DX), X1
   113  	AESENC X1, X0
   114  	MOVOU (16*3)(DX), X1
   115  	AESENC X1, X0
   116  	MOVOU (16*4)(DX), X1
   117  	AESENC X1, X0
   118  	MOVOU (16*5)(DX), X1
   119  	AESENC X1, X0
   120  	MOVOU (16*6)(DX), X1
   121  	AESENC X1, X0
   122  	MOVOU (16*7)(DX), X1
   123  	AESENC X1, X0
   124  	MOVOU (16*8)(DX), X1
   125  	AESENC X1, X0
   126  	MOVOU (16*9)(DX), X1
   127  	AESENC X1, X0
   128  	MOVOU (16*10)(DX), X1
   129  	CMPQ CX, $12
   130  	JB encLast	// fewer than 12 rounds: key 10 is the final round key
   131  	AESENC X1, X0
   132  	MOVOU (16*11)(DX), X1
   133  	AESENC X1, X0
   134  	MOVOU (16*12)(DX), X1
   135  	JE encLast	// still testing the CMPQ above: MOVOU/AESENC leave the flags untouched
   136  	AESENC X1, X0
   137  	MOVOU (16*13)(DX), X1
   138  	AESENC X1, X0
   139  	MOVOU (16*14)(DX), X1
   140  
   141  encLast:
   142  	AESENCLAST X1, X0	// X1 holds the final round key selected above
   143  	MOVOU X0, (DI)
   144  
   145  	RET
   146  
   147  // func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
        //
        // Final hashing step: builds a block from pLen*8 (low qword) and dLen*8
        // (high qword), XORs it with the 16 bytes at T, multiplies by the table
        // entry at (16*14)(productTable) (its Karatsuba companion is at 16*15),
        // reduces, byte-reverses the result, XORs it with tagMask and writes the
        // 16 bytes back to T.
   148  TEXT ·gcmAesFinish(SB),NOSPLIT,$0
   149  #define pTbl DI
   150  #define tMsk SI
   151  #define tPtr DX
   152  #define plen AX
   153  #define dlen CX
   154  
   155  	MOVQ productTable+0(FP), pTbl
   156  	MOVQ tagMask+8(FP), tMsk
   157  	MOVQ T+16(FP), tPtr
   158  	MOVQ pLen+24(FP), plen
   159  	MOVQ dLen+32(FP), dlen
   160  
   161  	MOVOU (tPtr), ACC0	// current hash state
   162  	MOVOU (tMsk), T2	// kept in T2 until the final XOR
   163  
   164  	MOVOU bswapMask<>(SB), BSWAP
   165  	MOVOU gcmPoly<>(SB), POLY
   166  	// Multiply both lengths by 8 (byte counts -> bit counts).
   167  	SHLQ $3, plen
   168  	SHLQ $3, dlen
   169  	// B0 = [ dlen*8 : plen*8 ] (high qword : low qword)
   170  	MOVQ plen, B0
   171  	PINSRQ $1, dlen, B0
   172  
   173  	PXOR ACC0, B0
   174  	// Karatsuba multiply of B0 by the table entry: ACC0 = lo*lo, ACC1 = hi*hi, ACCM = (lo^hi)*(lo^hi).
   175  	MOVOU (16*14)(pTbl), ACC0
   176  	MOVOU (16*15)(pTbl), ACCM
   177  	MOVOU ACC0, ACC1
   178  
   179  	PCLMULQDQ $0x00, B0, ACC0
   180  	PCLMULQDQ $0x11, B0, ACC1
   181  	PSHUFD $78, B0, T0	// $78 swaps the two 64-bit halves
   182  	PXOR B0, T0
   183  	PCLMULQDQ $0x00, T0, ACCM
   184  	// Recover the middle term and fold it into the 256-bit product ACC1:ACC0.
   185  	PXOR ACC0, ACCM
   186  	PXOR ACC1, ACCM
   187  	MOVOU ACCM, T0
   188  	PSRLDQ $8, ACCM
   189  	PSLLDQ $8, T0
   190  	PXOR ACCM, ACC1
   191  	PXOR T0, ACC0
   192  	// Two reduction rounds on the low half, then combine with the high half.
   193  	MOVOU POLY, T0
   194  	PCLMULQDQ $0x01, ACC0, T0
   195  	PSHUFD $78, ACC0, ACC0
   196  	PXOR T0, ACC0
   197  
   198  	MOVOU POLY, T0
   199  	PCLMULQDQ $0x01, ACC0, T0
   200  	PSHUFD $78, ACC0, ACC0
   201  	PXOR T0, ACC0
   202  
   203  	PXOR ACC1, ACC0
   204  	// Reverse the byte order, apply the tag mask and store the result over T.
   205  	PSHUFB BSWAP, ACC0
   206  	PXOR T2, ACC0
   207  	MOVOU ACC0, (tPtr)
   208  
   209  	RET
   210  #undef pTbl
   211  #undef tMsk
   212  #undef tPtr
   213  #undef plen
   214  #undef dlen
   215  
   216  // func gcmAesInit(productTable *[256]byte, ks []uint32)
        //
        // Fills the 256-byte productTable from the key schedule ks. The hash key H is
        // derived by running the AES rounds on round key 0 (see below), byte-reversed
        // and doubled. Eight (value, Karatsuba fold) pairs of 16 bytes each are then
        // stored: the doubled H at offsets 16*14 / 16*15, and each successive product
        // with H at the next lower pair, down to offsets 16*0 / 16*1.
   217  TEXT ·gcmAesInit(SB),NOSPLIT,$0
   218  #define dst DI
   219  #define KS SI
   220  #define NR DX
   221  
   222  	MOVQ productTable+0(FP), dst
   223  	MOVQ ks_base+8(FP), KS
   224  	MOVQ ks_len+16(FP), NR
   225  	// NR = len(ks)/4 - 1 = number of rounds
   226  	SHRQ $2, NR
   227  	DECQ NR
   228  
   229  	MOVOU bswapMask<>(SB), BSWAP
   230  	MOVOU gcmPoly<>(SB), POLY
   231  
   232  	// Encrypt block 0, with the AES key to generate the hash key H
        	// (an all-zero block XOR round key 0 is round key 0 itself, so it is loaded directly)
   233  	MOVOU (16*0)(KS), B0
   234  	MOVOU (16*1)(KS), T0
   235  	AESENC T0, B0
   236  	MOVOU (16*2)(KS), T0
   237  	AESENC T0, B0
   238  	MOVOU (16*3)(KS), T0
   239  	AESENC T0, B0
   240  	MOVOU (16*4)(KS), T0
   241  	AESENC T0, B0
   242  	MOVOU (16*5)(KS), T0
   243  	AESENC T0, B0
   244  	MOVOU (16*6)(KS), T0
   245  	AESENC T0, B0
   246  	MOVOU (16*7)(KS), T0
   247  	AESENC T0, B0
   248  	MOVOU (16*8)(KS), T0
   249  	AESENC T0, B0
   250  	MOVOU (16*9)(KS), T0
   251  	AESENC T0, B0
   252  	MOVOU (16*10)(KS), T0
   253  	CMPQ NR, $12
   254  	JB initEncLast	// fewer than 12 rounds: key 10 is the final round key
   255  	AESENC T0, B0
   256  	MOVOU (16*11)(KS), T0
   257  	AESENC T0, B0
   258  	MOVOU (16*12)(KS), T0
   259  	JE initEncLast	// flags still come from the CMPQ above
   260  	AESENC T0, B0
   261  	MOVOU (16*13)(KS), T0
   262  	AESENC T0, B0
   263  	MOVOU (16*14)(KS), T0
   264  initEncLast:
   265  	AESENCLAST T0, B0
   266  
   267  	PSHUFB BSWAP, B0
   268  	// H * 2
        	// 128-bit left shift by one bit built from 32-bit lanes: PSLLL shifts every
        	// dword, T1 carries each dword's top bit into the next dword, and T0 (top
        	// dword's sign bit broadcast, masked with POLY) folds the bit shifted out
        	// of the top back into the value.
   269  	PSHUFD $0xff, B0, T0
   270  	MOVOU B0, T1
   271  	PSRAL $31, T0
   272  	PAND POLY, T0
   273  	PSRLL $31, T1
   274  	PSLLDQ $4, T1
   275  	PSLLL $1, B0
   276  	PXOR T0, B0
   277  	PXOR T1, B0
   278  	// Karatsuba pre-computations
        	// (the XOR of the value's two 64-bit halves is stored next to the value)
   279  	MOVOU B0, (16*14)(dst)
   280  	PSHUFD $78, B0, B1
   281  	PXOR B0, B1
   282  	MOVOU B1, (16*15)(dst)
   283  
   284  	MOVOU B0, B2
   285  	MOVOU B1, B3
   286  	// Now prepare powers of H and pre-computations for them
        	// B0/B1 stay fixed; B2/B3 hold the most recently computed power and its fold.
   287  	MOVQ $7, AX
   288  
   289  initLoop:
        		// Karatsuba multiply B2 by B0: T0 = lo*lo, T1 = hi*hi, T2 = fold*fold.
   290  		MOVOU B2, T0
   291  		MOVOU B2, T1
   292  		MOVOU B3, T2
   293  		PCLMULQDQ $0x00, B0, T0
   294  		PCLMULQDQ $0x11, B0, T1
   295  		PCLMULQDQ $0x00, B1, T2
   296  
   297  		PXOR T0, T2
   298  		PXOR T1, T2
   299  		MOVOU T2, B4
   300  		PSLLDQ $8, B4
   301  		PSRLDQ $8, T2
   302  		PXOR B4, T0
   303  		PXOR T2, T1
   304  		// Two reduction rounds; the reduced product ends up in B2.
   305  		MOVOU POLY, B2
   306  		PCLMULQDQ $0x01, T0, B2
   307  		PSHUFD $78, T0, T0
   308  		PXOR B2, T0
   309  		MOVOU POLY, B2
   310  		PCLMULQDQ $0x01, T0, B2
   311  		PSHUFD $78, T0, T0
   312  		PXOR T0, B2
   313  		PXOR T1, B2
   314  		// Store the new power and its fold one pair below the previous one.
   315  		MOVOU B2, (16*12)(dst)
   316  		PSHUFD $78, B2, B3
   317  		PXOR B2, B3
   318  		MOVOU B3, (16*13)(dst)
   319  
   320  		DECQ AX
   321  		LEAQ (-16*2)(dst), dst	// LEAQ leaves the flags alone, so JNE still tests the DECQ
   322  	JNE initLoop
   323  
   324  	RET
   325  #undef NR
   326  #undef KS
   327  #undef dst
   328  
   329  // func gcmAesData(productTable *[256]byte, data []byte, T *[16]byte)
        //
        // Hashes data with the precomputed productTable and stores the 16-byte
        // accumulator at T. The accumulator starts from zero; the previous contents
        // of T are never read. Full 128-byte groups go through an eight-block path
        // with a single reduction, remaining 16-byte blocks are handled one at a
        // time, and a trailing 1..15 bytes are zero-extended to a full block.
        // A 13-byte input has a dedicated load path.
   330  TEXT ·gcmAesData(SB),NOSPLIT,$0
   331  #define pTbl DI
   332  #define aut SI
   333  #define tPtr CX
   334  #define autLen DX
   335  
        // reduceRound(a): one reduction round on a, using POLY and clobbering T0.
        // Not #undef'd in this function; later code in the file uses it as well.
   336  #define reduceRound(a) 	MOVOU POLY, T0;	PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a
        // mulRoundAAD(X, i): multiply block X by table pair i (offsets 16*2i and 16*(2i+1))
        // and XOR the three partial products into ACC0, ACC1 and ACCM. Clobbers X, T1, T2.
   337  #define mulRoundAAD(X ,i) \
   338  	MOVOU (16*(i*2))(pTbl), T1;\
   339  	MOVOU T1, T2;\
   340  	PCLMULQDQ $0x00, X, T1;\
   341  	PXOR T1, ACC0;\
   342  	PCLMULQDQ $0x11, X, T2;\
   343  	PXOR T2, ACC1;\
   344  	PSHUFD $78, X, T1;\
   345  	PXOR T1, X;\
   346  	MOVOU (16*(i*2+1))(pTbl), T1;\
   347  	PCLMULQDQ $0x00, X, T1;\
   348  	PXOR T1, ACCM
   349  
   350  	MOVQ productTable+0(FP), pTbl
   351  	MOVQ data_base+8(FP), aut
   352  	MOVQ data_len+16(FP), autLen
   353  	MOVQ T+32(FP), tPtr	// data occupies 24 bytes of the frame (base, len, cap)
   354  
   355  	PXOR ACC0, ACC0	// accumulator starts at zero
   356  	MOVOU bswapMask<>(SB), BSWAP
   357  	MOVOU gcmPoly<>(SB), POLY
   358  
   359  	TESTQ autLen, autLen
   360  	JEQ dataBail	// empty input: store the zero accumulator
   361  
   362  	CMPQ autLen, $13	// optimize the TLS case
   363  	JE dataTLS
   364  	CMPQ autLen, $128
   365  	JB startSinglesLoop
   366  	JMP dataOctaLoop
   367  
   368  dataTLS:
        	// Exactly 13 bytes: assemble the block from an 8-, a 4- and a 1-byte load,
        	// then run one pass of dataMul with autLen forced to 0 so the loop exits.
   369  	MOVOU (16*14)(pTbl), T1
   370  	MOVOU (16*15)(pTbl), T2
   371  	PXOR B0, B0
   372  	MOVQ (aut), B0
   373  	PINSRD $2, 8(aut), B0
   374  	PINSRB $12, 12(aut), B0
   375  	XORQ autLen, autLen
   376  	JMP dataMul
   377  
   378  dataOctaLoop:
        		// Eight blocks per iteration; block i is multiplied by table pair i.
   379  		CMPQ autLen, $128
   380  		JB startSinglesLoop
   381  		SUBQ $128, autLen
   382  
   383  		MOVOU (16*0)(aut), X0
   384  		MOVOU (16*1)(aut), X1
   385  		MOVOU (16*2)(aut), X2
   386  		MOVOU (16*3)(aut), X3
   387  		MOVOU (16*4)(aut), X4
   388  		MOVOU (16*5)(aut), X5
   389  		MOVOU (16*6)(aut), X6
   390  		MOVOU (16*7)(aut), X7
   391  		LEAQ (16*8)(aut), aut
   392  		PSHUFB BSWAP, X0
   393  		PSHUFB BSWAP, X1
   394  		PSHUFB BSWAP, X2
   395  		PSHUFB BSWAP, X3
   396  		PSHUFB BSWAP, X4
   397  		PSHUFB BSWAP, X5
   398  		PSHUFB BSWAP, X6
   399  		PSHUFB BSWAP, X7
   400  		PXOR ACC0, X0	// fold the running accumulator into the first block
   401  		// Block 0 initialises the three accumulators; blocks 1..7 add into them.
   402  		MOVOU (16*0)(pTbl), ACC0
   403  		MOVOU (16*1)(pTbl), ACCM
   404  		MOVOU ACC0, ACC1
   405  		PSHUFD $78, X0, T1
   406  		PXOR X0, T1
   407  		PCLMULQDQ $0x00, X0, ACC0
   408  		PCLMULQDQ $0x11, X0, ACC1
   409  		PCLMULQDQ $0x00, T1, ACCM
   410  
   411  		mulRoundAAD(X1, 1)
   412  		mulRoundAAD(X2, 2)
   413  		mulRoundAAD(X3, 3)
   414  		mulRoundAAD(X4, 4)
   415  		mulRoundAAD(X5, 5)
   416  		mulRoundAAD(X6, 6)
   417  		mulRoundAAD(X7, 7)
   418  		// Combine the Karatsuba terms and reduce once for all eight blocks.
   419  		PXOR ACC0, ACCM
   420  		PXOR ACC1, ACCM
   421  		MOVOU ACCM, T0
   422  		PSRLDQ $8, ACCM
   423  		PSLLDQ $8, T0
   424  		PXOR ACCM, ACC1
   425  		PXOR T0, ACC0
   426  		reduceRound(ACC0)
   427  		reduceRound(ACC0)
   428  		PXOR ACC1, ACC0
   429  	JMP dataOctaLoop
   430  
   431  startSinglesLoop:
        	// T1/T2 cache the table pair at offsets 16*14 / 16*15 for the single-block multiply.
   432  	MOVOU (16*14)(pTbl), T1
   433  	MOVOU (16*15)(pTbl), T2
   434  
   435  dataSinglesLoop:
   436  
   437  		CMPQ autLen, $16
   438  		JB dataEnd
   439  		SUBQ $16, autLen
   440  
   441  		MOVOU (aut), B0
   442  dataMul:
        		// Shared by the full-block loop, the 13-byte path and the partial-block path:
        		// ACC0 = reduce((ACC0 ^ byteswap(B0)) * table entry).
   443  		PSHUFB BSWAP, B0
   444  		PXOR ACC0, B0
   445  
   446  		MOVOU T1, ACC0
   447  		MOVOU T2, ACCM
   448  		MOVOU T1, ACC1
   449  
   450  		PSHUFD $78, B0, T0
   451  		PXOR B0, T0
   452  		PCLMULQDQ $0x00, B0, ACC0
   453  		PCLMULQDQ $0x11, B0, ACC1
   454  		PCLMULQDQ $0x00, T0, ACCM
   455  
   456  		PXOR ACC0, ACCM
   457  		PXOR ACC1, ACCM
   458  		MOVOU ACCM, T0
   459  		PSRLDQ $8, ACCM
   460  		PSLLDQ $8, T0
   461  		PXOR ACCM, ACC1
   462  		PXOR T0, ACC0
   463  
   464  		MOVOU POLY, T0
   465  		PCLMULQDQ $0x01, ACC0, T0
   466  		PSHUFD $78, ACC0, ACC0
   467  		PXOR T0, ACC0
   468  
   469  		MOVOU POLY, T0
   470  		PCLMULQDQ $0x01, ACC0, T0
   471  		PSHUFD $78, ACC0, ACC0
   472  		PXOR T0, ACC0
   473  		PXOR ACC1, ACC0
   474  
   475  		LEAQ 16(aut), aut
   476  
   477  	JMP dataSinglesLoop
   478  
   479  dataEnd:
   480  
   481  	TESTQ autLen, autLen
   482  	JEQ dataBail
   483  	// 1..15 bytes remain: build a zero-padded block, reading from the last byte backwards.
   484  	PXOR B0, B0
   485  	LEAQ -1(aut)(autLen*1), aut	// aut -> last remaining byte
   486  
   487  dataLoadLoop:
   488  
   489  		PSLLDQ $1, B0	// make room in byte 0
   490  		PINSRB $0, (aut), B0
   491  
   492  		LEAQ -1(aut), aut
   493  		DECQ autLen
   494  		JNE dataLoadLoop
   495  	// autLen is now 0, so after this final multiply the loop falls through to dataBail.
   496  	JMP dataMul
   497  
   498  dataBail:
   499  	MOVOU ACC0, (tPtr)
   500  	RET
   501  #undef pTbl
   502  #undef aut
   503  #undef tPtr
   504  #undef autLen
   505  
   506  // func gcmAesEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
        //
        // Encrypts len(src) bytes from src into dst in counter mode and folds the
        // produced ciphertext into the 16-byte hash accumulator at T (read on entry,
        // written on exit). The value at ctr is read but not written back; its last
        // 32-bit word is incremented (big-endian) before the first block is
        // generated, so the first keystream block uses ctr+1.
        //
        // Frame layout (256 bytes):
        //   (0*16 .. 7*16)(SP)          eight byte-reversed ciphertext blocks awaiting hashing
        //   (8*16 .. 15*16)(SP)         eight counter blocks, already XORed with round key 0
        //
        // A trailing partial block is stored to dst with a full 16-byte write (see
        // the comment at the store below).
   507  TEXT ·gcmAesEnc(SB),0,$256-96
   508  #define pTbl DI
   509  #define ctx DX
   510  #define ctrPtr CX
   511  #define ptx SI
   512  #define ks AX
   513  #define tPtr R8
   514  #define ptxLen R9
   515  #define aluCTR R10
   516  #define aluTMP R11
   517  #define aluK R12
   518  #define NR R13
   519  
        // increment(i): bump the 32-bit counter in aluCTR and write bswap(aluCTR ^ aluK)
        // into the last dword of counter slot i. #undef'd at the end of this function.
   520  #define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + 8*16 + i*16)(SP)
        // aesRnd(k) / aesRound(i) / aesRndLast(k): apply one AES round to B0..B7, taking the
        // round key from register k or from (16*i)(ks) (aesRound clobbers T0).
   521  #define aesRnd(k) AESENC k, B0; AESENC k, B1; AESENC k, B2; AESENC k, B3; AESENC k, B4; AESENC k, B5; AESENC k, B6; AESENC k, B7
   522  #define aesRound(i) MOVOU (16*i)(ks), T0;AESENC T0, B0; AESENC T0, B1; AESENC T0, B2; AESENC T0, B3; AESENC T0, B4; AESENC T0, B5; AESENC T0, B6; AESENC T0, B7
   523  #define aesRndLast(k) AESENCLAST k, B0; AESENCLAST k, B1; AESENCLAST k, B2; AESENCLAST k, B3; AESENCLAST k, B4; AESENCLAST k, B5; AESENCLAST k, B6; AESENCLAST k, B7
        // combinedRound(i): AES round i on B0..B7 interleaved with the multiply of the
        // stashed block at (16*i)(SP) by table pair i, accumulated into ACC0/ACC1/ACCM.
   524  #define combinedRound(i) \
   525  	MOVOU (16*i)(ks), T0;\
   526  	AESENC T0, B0;\
   527  	AESENC T0, B1;\
   528  	AESENC T0, B2;\
   529  	AESENC T0, B3;\
   530  	 MOVOU (16*(i*2))(pTbl), T1;\
   531  	 MOVOU T1, T2;\
   532  	AESENC T0, B4;\
   533  	AESENC T0, B5;\
   534  	AESENC T0, B6;\
   535  	AESENC T0, B7;\
   536  	 MOVOU (16*i)(SP), T0;\
   537  	 PCLMULQDQ $0x00, T0, T1;\
   538  	 PXOR T1, ACC0;\
   539  	 PSHUFD $78, T0, T1;\
   540  	 PCLMULQDQ $0x11, T0, T2;\
   541  	 PXOR T1, T0;\
   542  	 PXOR T2, ACC1;\
   543  	 MOVOU (16*(i*2+1))(pTbl), T2;\
   544  	 PCLMULQDQ $0x00, T2, T0;\
   545  	 PXOR T0, ACCM
        // mulRound(i): the hashing half of combinedRound(i) on its own.
   546  #define mulRound(i) \
   547  	MOVOU (16*i)(SP), T0;\
   548  	MOVOU (16*(i*2))(pTbl), T1;\
   549  	MOVOU T1, T2;\
   550  	PCLMULQDQ $0x00, T0, T1;\
   551  	PXOR T1, ACC0;\
   552  	PCLMULQDQ $0x11, T0, T2;\
   553  	PXOR T2, ACC1;\
   554  	PSHUFD $78, T0, T1;\
   555  	PXOR T1, T0;\
   556  	MOVOU (16*(i*2+1))(pTbl), T1;\
   557  	PCLMULQDQ $0x00, T0, T1;\
   558  	PXOR T1, ACCM
   559  
   560  	MOVQ productTable+0(FP), pTbl
   561  	MOVQ dst+8(FP), ctx
   562  	MOVQ src_base+32(FP), ptx
   563  	MOVQ src_len+40(FP), ptxLen
   564  	MOVQ ctr+56(FP), ctrPtr
   565  	MOVQ T+64(FP), tPtr
   566  	MOVQ ks_base+72(FP), ks
   567  	MOVQ ks_len+80(FP), NR
   568  	// NR = len(ks)/4 - 1 = number of rounds
   569  	SHRQ $2, NR
   570  	DECQ NR
   571  
   572  	MOVOU bswapMask<>(SB), BSWAP
   573  	MOVOU gcmPoly<>(SB), POLY
   574  
   575  	MOVOU (tPtr), ACC0	// running hash accumulator
   576  	PXOR ACC1, ACC1
   577  	PXOR ACCM, ACCM
   578  	MOVOU (ctrPtr), B0
   579  	MOVL (3*4)(ctrPtr), aluCTR
   580  	MOVOU (ks), T0
   581  	MOVL (3*4)(ks), aluK
   582  	BSWAPL aluCTR	// last word of the counter block as a native integer
   583  	BSWAPL aluK	// matching word of round key 0, XORed back in by increment()
   584  	// T0 = ctr ^ round key 0: template for every counter slot; increment() patches the last dword.
   585  	PXOR B0, T0
   586  	MOVOU T0, (8*16 + 0*16)(SP)
   587  	increment(0)
   588  
   589  	CMPQ ptxLen, $128
   590  	JB gcmAesEncSingles
   591  	SUBQ $128, ptxLen
   592  
   593  	// We have at least 8 blocks to encrypt, prepare the rest of the counters
   594  	MOVOU T0, (8*16 + 1*16)(SP)
   595  	increment(1)
   596  	MOVOU T0, (8*16 + 2*16)(SP)
   597  	increment(2)
   598  	MOVOU T0, (8*16 + 3*16)(SP)
   599  	increment(3)
   600  	MOVOU T0, (8*16 + 4*16)(SP)
   601  	increment(4)
   602  	MOVOU T0, (8*16 + 5*16)(SP)
   603  	increment(5)
   604  	MOVOU T0, (8*16 + 6*16)(SP)
   605  	increment(6)
   606  	MOVOU T0, (8*16 + 7*16)(SP)
   607  	increment(7)
   608  
   609  	MOVOU (8*16 + 0*16)(SP), B0
   610  	MOVOU (8*16 + 1*16)(SP), B1
   611  	MOVOU (8*16 + 2*16)(SP), B2
   612  	MOVOU (8*16 + 3*16)(SP), B3
   613  	MOVOU (8*16 + 4*16)(SP), B4
   614  	MOVOU (8*16 + 5*16)(SP), B5
   615  	MOVOU (8*16 + 6*16)(SP), B6
   616  	MOVOU (8*16 + 7*16)(SP), B7
   617  	// First group of eight: AES only. The interleaved increments refill the slots for the next group.
   618  	aesRound(1)
   619  	increment(0)
   620  	aesRound(2)
   621  	increment(1)
   622  	aesRound(3)
   623  	increment(2)
   624  	aesRound(4)
   625  	increment(3)
   626  	aesRound(5)
   627  	increment(4)
   628  	aesRound(6)
   629  	increment(5)
   630  	aesRound(7)
   631  	increment(6)
   632  	aesRound(8)
   633  	increment(7)
   634  	aesRound(9)
   635  	MOVOU (16*10)(ks), T0
   636  	CMPQ NR, $12
   637  	JB encLast1
   638  	aesRnd(T0)
   639  	aesRound(11)
   640  	MOVOU (16*12)(ks), T0
   641  	JE encLast1	// flags still come from the CMPQ above
   642  	aesRnd(T0)
   643  	aesRound(13)
   644  	MOVOU (16*14)(ks), T0
   645  encLast1:
   646  	aesRndLast(T0)
   647  	// XOR the keystream with the plaintext.
   648  	MOVOU (16*0)(ptx), T0
   649  	PXOR T0, B0
   650  	MOVOU (16*1)(ptx), T0
   651  	PXOR T0, B1
   652  	MOVOU (16*2)(ptx), T0
   653  	PXOR T0, B2
   654  	MOVOU (16*3)(ptx), T0
   655  	PXOR T0, B3
   656  	MOVOU (16*4)(ptx), T0
   657  	PXOR T0, B4
   658  	MOVOU (16*5)(ptx), T0
   659  	PXOR T0, B5
   660  	MOVOU (16*6)(ptx), T0
   661  	PXOR T0, B6
   662  	MOVOU (16*7)(ptx), T0
   663  	PXOR T0, B7
   664  	// Store the ciphertext, then byte-reverse each block for hashing; block 0 absorbs the accumulator.
   665  	MOVOU B0, (16*0)(ctx)
   666  	PSHUFB BSWAP, B0
   667  	PXOR ACC0, B0
   668  	MOVOU B1, (16*1)(ctx)
   669  	PSHUFB BSWAP, B1
   670  	MOVOU B2, (16*2)(ctx)
   671  	PSHUFB BSWAP, B2
   672  	MOVOU B3, (16*3)(ctx)
   673  	PSHUFB BSWAP, B3
   674  	MOVOU B4, (16*4)(ctx)
   675  	PSHUFB BSWAP, B4
   676  	MOVOU B5, (16*5)(ctx)
   677  	PSHUFB BSWAP, B5
   678  	MOVOU B6, (16*6)(ctx)
   679  	PSHUFB BSWAP, B6
   680  	MOVOU B7, (16*7)(ctx)
   681  	PSHUFB BSWAP, B7
   682  	// Stash the eight blocks; they are hashed while the next group is being encrypted.
   683  	MOVOU B0, (16*0)(SP)
   684  	MOVOU B1, (16*1)(SP)
   685  	MOVOU B2, (16*2)(SP)
   686  	MOVOU B3, (16*3)(SP)
   687  	MOVOU B4, (16*4)(SP)
   688  	MOVOU B5, (16*5)(SP)
   689  	MOVOU B6, (16*6)(SP)
   690  	MOVOU B7, (16*7)(SP)
   691  
   692  	LEAQ 128(ptx), ptx
   693  	LEAQ 128(ctx), ctx
   694  
   695  gcmAesEncOctetsLoop:
        		// Each iteration encrypts eight new blocks and hashes the eight stashed ones.
   696  
   697  		CMPQ ptxLen, $128
   698  		JB gcmAesEncOctetsEnd
   699  		SUBQ $128, ptxLen
   700  
   701  		MOVOU (8*16 + 0*16)(SP), B0
   702  		MOVOU (8*16 + 1*16)(SP), B1
   703  		MOVOU (8*16 + 2*16)(SP), B2
   704  		MOVOU (8*16 + 3*16)(SP), B3
   705  		MOVOU (8*16 + 4*16)(SP), B4
   706  		MOVOU (8*16 + 5*16)(SP), B5
   707  		MOVOU (8*16 + 6*16)(SP), B6
   708  		MOVOU (8*16 + 7*16)(SP), B7
   709  		// Stashed block 0 initialises the accumulators with table pair 0.
   710  		MOVOU (16*0)(SP), T0
   711  		PSHUFD $78, T0, T1
   712  		PXOR T0, T1
   713  
   714  		MOVOU (16*0)(pTbl), ACC0
   715  		MOVOU (16*1)(pTbl), ACCM
   716  		MOVOU ACC0, ACC1
   717  
   718  		PCLMULQDQ $0x00, T1, ACCM
   719  		PCLMULQDQ $0x00, T0, ACC0
   720  		PCLMULQDQ $0x11, T0, ACC1
   721  
   722  		combinedRound(1)
   723  		increment(0)
   724  		combinedRound(2)
   725  		increment(1)
   726  		combinedRound(3)
   727  		increment(2)
   728  		combinedRound(4)
   729  		increment(3)
   730  		combinedRound(5)
   731  		increment(4)
   732  		combinedRound(6)
   733  		increment(5)
   734  		combinedRound(7)
   735  		increment(6)
   736  
   737  		aesRound(8)
   738  		increment(7)
   739  		// Combine the Karatsuba terms; the two reduction rounds are interleaved with AES round 9.
   740  		PXOR ACC0, ACCM
   741  		PXOR ACC1, ACCM
   742  		MOVOU ACCM, T0
   743  		PSRLDQ $8, ACCM
   744  		PSLLDQ $8, T0
   745  		PXOR ACCM, ACC1
   746  		PXOR T0, ACC0
   747  
   748  		reduceRound(ACC0)
   749  		aesRound(9)
   750  
   751  		reduceRound(ACC0)
   752  		PXOR ACC1, ACC0
   753  
   754  		MOVOU (16*10)(ks), T0
   755  		CMPQ NR, $12
   756  		JB encLast2
   757  		aesRnd(T0)
   758  		aesRound(11)
   759  		MOVOU (16*12)(ks), T0
   760  		JE encLast2	// flags still come from the CMPQ above
   761  		aesRnd(T0)
   762  		aesRound(13)
   763  		MOVOU (16*14)(ks), T0
   764  encLast2:
   765  		aesRndLast(T0)
   766  
   767  		MOVOU (16*0)(ptx), T0
   768  		PXOR T0, B0
   769  		MOVOU (16*1)(ptx), T0
   770  		PXOR T0, B1
   771  		MOVOU (16*2)(ptx), T0
   772  		PXOR T0, B2
   773  		MOVOU (16*3)(ptx), T0
   774  		PXOR T0, B3
   775  		MOVOU (16*4)(ptx), T0
   776  		PXOR T0, B4
   777  		MOVOU (16*5)(ptx), T0
   778  		PXOR T0, B5
   779  		MOVOU (16*6)(ptx), T0
   780  		PXOR T0, B6
   781  		MOVOU (16*7)(ptx), T0
   782  		PXOR T0, B7
   783  
   784  		MOVOU B0, (16*0)(ctx)
   785  		PSHUFB BSWAP, B0
   786  		PXOR ACC0, B0
   787  		MOVOU B1, (16*1)(ctx)
   788  		PSHUFB BSWAP, B1
   789  		MOVOU B2, (16*2)(ctx)
   790  		PSHUFB BSWAP, B2
   791  		MOVOU B3, (16*3)(ctx)
   792  		PSHUFB BSWAP, B3
   793  		MOVOU B4, (16*4)(ctx)
   794  		PSHUFB BSWAP, B4
   795  		MOVOU B5, (16*5)(ctx)
   796  		PSHUFB BSWAP, B5
   797  		MOVOU B6, (16*6)(ctx)
   798  		PSHUFB BSWAP, B6
   799  		MOVOU B7, (16*7)(ctx)
   800  		PSHUFB BSWAP, B7
   801  
   802  		MOVOU B0, (16*0)(SP)
   803  		MOVOU B1, (16*1)(SP)
   804  		MOVOU B2, (16*2)(SP)
   805  		MOVOU B3, (16*3)(SP)
   806  		MOVOU B4, (16*4)(SP)
   807  		MOVOU B5, (16*5)(SP)
   808  		MOVOU B6, (16*6)(SP)
   809  		MOVOU B7, (16*7)(SP)
   810  
   811  		LEAQ 128(ptx), ptx
   812  		LEAQ 128(ctx), ctx
   813  
   814  		JMP gcmAesEncOctetsLoop
   815  
   816  gcmAesEncOctetsEnd:
   817  	// Hash the last eight stashed blocks; there is no further encryption to overlap with.
   818  	MOVOU (16*0)(SP), T0
   819  	MOVOU (16*0)(pTbl), ACC0
   820  	MOVOU (16*1)(pTbl), ACCM
   821  	MOVOU ACC0, ACC1
   822  	PSHUFD $78, T0, T1
   823  	PXOR T0, T1
   824  	PCLMULQDQ $0x00, T0, ACC0
   825  	PCLMULQDQ $0x11, T0, ACC1
   826  	PCLMULQDQ $0x00, T1, ACCM
   827  
   828  	mulRound(1)
   829  	mulRound(2)
   830  	mulRound(3)
   831  	mulRound(4)
   832  	mulRound(5)
   833  	mulRound(6)
   834  	mulRound(7)
   835  
   836  	PXOR ACC0, ACCM
   837  	PXOR ACC1, ACCM
   838  	MOVOU ACCM, T0
   839  	PSRLDQ $8, ACCM
   840  	PSLLDQ $8, T0
   841  	PXOR ACCM, ACC1
   842  	PXOR T0, ACC0
   843  
   844  	reduceRound(ACC0)
   845  	reduceRound(ACC0)
   846  	PXOR ACC1, ACC0
   847  
   848  	TESTQ ptxLen, ptxLen
   849  	JE gcmAesEncDone
   850  	// Slots 1..7 were filled ahead; only slot 0 is used from here on, so rewind aluCTR to slot 0's value.
   851  	SUBQ $7, aluCTR
   852  
   853  gcmAesEncSingles:
   854  	// Cache round keys 1..7 in B1..B7 and the table entry at offset 16*14 in T2.
   855  	MOVOU (16*1)(ks), B1
   856  	MOVOU (16*2)(ks), B2
   857  	MOVOU (16*3)(ks), B3
   858  	MOVOU (16*4)(ks), B4
   859  	MOVOU (16*5)(ks), B5
   860  	MOVOU (16*6)(ks), B6
   861  	MOVOU (16*7)(ks), B7
   862  
   863  	MOVOU (16*14)(pTbl), T2
   864  
   865  gcmAesEncSinglesLoop:
   866  
   867  		CMPQ ptxLen, $16
   868  		JB gcmAesEncTail
   869  		SUBQ $16, ptxLen
   870  
   871  		MOVOU (8*16 + 0*16)(SP), B0
   872  		increment(0)	// prepare slot 0 for the following block
   873  
   874  		AESENC B1, B0
   875  		AESENC B2, B0
   876  		AESENC B3, B0
   877  		AESENC B4, B0
   878  		AESENC B5, B0
   879  		AESENC B6, B0
   880  		AESENC B7, B0
   881  		MOVOU (16*8)(ks), T0
   882  		AESENC T0, B0
   883  		MOVOU (16*9)(ks), T0
   884  		AESENC T0, B0
   885  		MOVOU (16*10)(ks), T0
   886  		CMPQ NR, $12
   887  		JB encLast3
   888  		AESENC T0, B0
   889  		MOVOU (16*11)(ks), T0
   890  		AESENC T0, B0
   891  		MOVOU (16*12)(ks), T0
   892  		JE encLast3	// flags still come from the CMPQ above
   893  		AESENC T0, B0
   894  		MOVOU (16*13)(ks), T0
   895  		AESENC T0, B0
   896  		MOVOU (16*14)(ks), T0
   897  encLast3:
   898  		AESENCLAST T0, B0
   899  
   900  		MOVOU (ptx), T0
   901  		PXOR T0, B0
   902  		MOVOU B0, (ctx)
   903  		// Hash the ciphertext block just written.
   904  		PSHUFB BSWAP, B0
   905  		PXOR ACC0, B0
   906  
   907  		MOVOU T2, ACC0
   908  		MOVOU T2, ACC1
   909  		MOVOU (16*15)(pTbl), ACCM
   910  
   911  		PSHUFD $78, B0, T0
   912  		PXOR B0, T0
   913  		PCLMULQDQ $0x00, B0, ACC0
   914  		PCLMULQDQ $0x11, B0, ACC1
   915  		PCLMULQDQ $0x00, T0, ACCM
   916  
   917  		PXOR ACC0, ACCM
   918  		PXOR ACC1, ACCM
   919  		MOVOU ACCM, T0
   920  		PSRLDQ $8, ACCM
   921  		PSLLDQ $8, T0
   922  		PXOR ACCM, ACC1
   923  		PXOR T0, ACC0
   924  
   925  		reduceRound(ACC0)
   926  		reduceRound(ACC0)
   927  		PXOR ACC1, ACC0
   928  
   929  		LEAQ (16*1)(ptx), ptx
   930  		LEAQ (16*1)(ctx), ctx
   931  
   932  	JMP gcmAesEncSinglesLoop
   933  
   934  gcmAesEncTail:
   935  	TESTQ ptxLen, ptxLen
   936  	JE gcmAesEncDone
   937  	// 1..15 bytes remain: generate one more keystream block from slot 0.
   938  	MOVOU (8*16 + 0*16)(SP), B0
   939  	AESENC B1, B0
   940  	AESENC B2, B0
   941  	AESENC B3, B0
   942  	AESENC B4, B0
   943  	AESENC B5, B0
   944  	AESENC B6, B0
   945  	AESENC B7, B0
   946  	MOVOU (16*8)(ks), T0
   947  	AESENC T0, B0
   948  	MOVOU (16*9)(ks), T0
   949  	AESENC T0, B0
   950  	MOVOU (16*10)(ks), T0
   951  	CMPQ NR, $12
   952  	JB encLast4
   953  	AESENC T0, B0
   954  	MOVOU (16*11)(ks), T0
   955  	AESENC T0, B0
   956  	MOVOU (16*12)(ks), T0
   957  	JE encLast4	// flags still come from the CMPQ above
   958  	AESENC T0, B0
   959  	MOVOU (16*13)(ks), T0
   960  	AESENC T0, B0
   961  	MOVOU (16*14)(ks), T0
   962  encLast4:
   963  	AESENCLAST T0, B0
   964  	MOVOU B0, T0	// T0 = keystream block
   965  
   966  	LEAQ -1(ptx)(ptxLen*1), ptx	// ptx -> last remaining plaintext byte
   967  	// T1 = andMask entry keeping the low ptxLen bytes (offset 16*ptxLen - 16).
   968  	MOVQ ptxLen, aluTMP
   969  	SHLQ $4, aluTMP
   970  
   971  	LEAQ andMask<>(SB), aluCTR	// aluCTR is free to reuse: no more counters are generated
   972  	MOVOU -16(aluCTR)(aluTMP*1), T1
   973  
   974  	PXOR B0, B0
   975  ptxLoadLoop:
        		// Gather the remaining bytes back to front so the first one lands in byte 0.
   976  		PSLLDQ $1, B0
   977  		PINSRB $0, (ptx), B0
   978  		LEAQ -1(ptx), ptx
   979  		DECQ ptxLen
   980  	JNE ptxLoadLoop
   981  
   982  	PXOR T0, B0
   983  	PAND T1, B0	// zero the bytes beyond the message
   984  	MOVOU B0, (ctx)	// I assume there is always space, due to TAG in the end of the CT
   985  	// Hash the zero-padded final ciphertext block.
   986  	PSHUFB BSWAP, B0
   987  	PXOR ACC0, B0
   988  
   989  	MOVOU T2, ACC0
   990  	MOVOU T2, ACC1
   991  	MOVOU (16*15)(pTbl), ACCM
   992  
   993  	PSHUFD $78, B0, T0
   994  	PXOR B0, T0
   995  	PCLMULQDQ $0x00, B0, ACC0
   996  	PCLMULQDQ $0x11, B0, ACC1
   997  	PCLMULQDQ $0x00, T0, ACCM
   998  
   999  	PXOR ACC0, ACCM
  1000  	PXOR ACC1, ACCM
  1001  	MOVOU ACCM, T0
  1002  	PSRLDQ $8, ACCM
  1003  	PSLLDQ $8, T0
  1004  	PXOR ACCM, ACC1
  1005  	PXOR T0, ACC0
  1006  
  1007  	reduceRound(ACC0)
  1008  	reduceRound(ACC0)
  1009  	PXOR ACC1, ACC0
  1010  
  1011  gcmAesEncDone:
  1012  	MOVOU ACC0, (tPtr)	// write the updated accumulator back to T
  1013  	RET
        // Only increment is removed here; the register names and the other macros stay defined for the code below.
  1014  #undef increment
  1015  
  1016  // func gcmAesDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
  1017  TEXT ·gcmAesDec(SB),0,$128-96
  1018  #define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + i*16)(SP)
  1019  #define combinedDecRound(i) \
  1020  	MOVOU (16*i)(ks), T0;\
  1021  	AESENC T0, B0;\
  1022  	AESENC T0, B1;\
  1023  	AESENC T0, B2;\
  1024  	AESENC T0, B3;\
  1025  	MOVOU (16*(i*2))(pTbl), T1;\
  1026  	MOVOU T1, T2;\
  1027  	AESENC T0, B4;\
  1028  	AESENC T0, B5;\
  1029  	AESENC T0, B6;\
  1030  	AESENC T0, B7;\
  1031  	MOVOU (16*i)(ctx), T0;\
  1032  	PSHUFB BSWAP, T0;\
  1033  	PCLMULQDQ $0x00, T0, T1;\
  1034  	PXOR T1, ACC0;\
  1035  	PSHUFD $78, T0, T1;\
  1036  	PCLMULQDQ $0x11, T0, T2;\
  1037  	PXOR T1, T0;\
  1038  	PXOR T2, ACC1;\
  1039  	MOVOU (16*(i*2+1))(pTbl), T2;\
  1040  	PCLMULQDQ $0x00, T2, T0;\
  1041  	PXOR T0, ACCM
  1042  
  1043  	MOVQ productTable+0(FP), pTbl
  1044  	MOVQ dst+8(FP), ptx
  1045  	MOVQ src_base+32(FP), ctx
  1046  	MOVQ src_len+40(FP), ptxLen
  1047  	MOVQ ctr+56(FP), ctrPtr
  1048  	MOVQ T+64(FP), tPtr
  1049  	MOVQ ks_base+72(FP), ks
  1050  	MOVQ ks_len+80(FP), NR
  1051  
  1052  	SHRQ $2, NR
  1053  	DECQ NR
  1054  
  1055  	MOVOU bswapMask<>(SB), BSWAP
  1056  	MOVOU gcmPoly<>(SB), POLY
  1057  
  1058  	MOVOU (tPtr), ACC0
  1059  	PXOR ACC1, ACC1
  1060  	PXOR ACCM, ACCM
  1061  	MOVOU (ctrPtr), B0
  1062  	MOVL (3*4)(ctrPtr), aluCTR
  1063  	MOVOU (ks), T0
  1064  	MOVL (3*4)(ks), aluK
  1065  	BSWAPL aluCTR
  1066  	BSWAPL aluK
  1067  
  1068  	PXOR B0, T0
  1069  	MOVOU T0, (0*16)(SP)
  1070  	increment(0)
  1071  
  1072  	CMPQ ptxLen, $128
  1073  	JB gcmAesDecSingles
  1074  
  1075  	MOVOU T0, (1*16)(SP)
  1076  	increment(1)
  1077  	MOVOU T0, (2*16)(SP)
  1078  	increment(2)
  1079  	MOVOU T0, (3*16)(SP)
  1080  	increment(3)
  1081  	MOVOU T0, (4*16)(SP)
  1082  	increment(4)
  1083  	MOVOU T0, (5*16)(SP)
  1084  	increment(5)
  1085  	MOVOU T0, (6*16)(SP)
  1086  	increment(6)
  1087  	MOVOU T0, (7*16)(SP)
  1088  	increment(7)
  1089  
  1090  gcmAesDecOctetsLoop:
  1091  
  1092  		CMPQ ptxLen, $128
  1093  		JB gcmAesDecEndOctets
  1094  		SUBQ $128, ptxLen
  1095  
  1096  		MOVOU (0*16)(SP), B0
  1097  		MOVOU (1*16)(SP), B1
  1098  		MOVOU (2*16)(SP), B2
  1099  		MOVOU (3*16)(SP), B3
  1100  		MOVOU (4*16)(SP), B4
  1101  		MOVOU (5*16)(SP), B5
  1102  		MOVOU (6*16)(SP), B6
  1103  		MOVOU (7*16)(SP), B7
  1104  
  1105  		MOVOU (16*0)(ctx), T0
  1106  		PSHUFB BSWAP, T0
  1107  		PXOR ACC0, T0
  1108  		PSHUFD $78, T0, T1
  1109  		PXOR T0, T1
  1110  
  1111  		MOVOU (16*0)(pTbl), ACC0
  1112  		MOVOU (16*1)(pTbl), ACCM
  1113  		MOVOU ACC0, ACC1
  1114  
  1115  		PCLMULQDQ $0x00, T1, ACCM
  1116  		PCLMULQDQ $0x00, T0, ACC0
  1117  		PCLMULQDQ $0x11, T0, ACC1
  1118  
  1119  		combinedDecRound(1)
  1120  		increment(0)
  1121  		combinedDecRound(2)
  1122  		increment(1)
  1123  		combinedDecRound(3)
  1124  		increment(2)
  1125  		combinedDecRound(4)
  1126  		increment(3)
  1127  		combinedDecRound(5)
  1128  		increment(4)
  1129  		combinedDecRound(6)
  1130  		increment(5)
  1131  		combinedDecRound(7)
  1132  		increment(6)
  1133  
  1134  		aesRound(8)
  1135  		increment(7)
  1136  
  1137  		PXOR ACC0, ACCM
  1138  		PXOR ACC1, ACCM
  1139  		MOVOU ACCM, T0
  1140  		PSRLDQ $8, ACCM
  1141  		PSLLDQ $8, T0
  1142  		PXOR ACCM, ACC1
  1143  		PXOR T0, ACC0
  1144  
  1145  		reduceRound(ACC0)
  1146  		aesRound(9)
  1147  
  1148  		reduceRound(ACC0)
  1149  		PXOR ACC1, ACC0
  1150  
  1151  		MOVOU (16*10)(ks), T0
  1152  		CMPQ NR, $12
  1153  		JB decLast1
  1154  		aesRnd(T0)
  1155  		aesRound(11)
  1156  		MOVOU (16*12)(ks), T0
  1157  		JE decLast1
  1158  		aesRnd(T0)
  1159  		aesRound(13)
  1160  		MOVOU (16*14)(ks), T0
  1161  decLast1:
  1162  		aesRndLast(T0)
  1163  
  1164  		MOVOU (16*0)(ctx), T0
  1165  		PXOR T0, B0
  1166  		MOVOU (16*1)(ctx), T0
  1167  		PXOR T0, B1
  1168  		MOVOU (16*2)(ctx), T0
  1169  		PXOR T0, B2
  1170  		MOVOU (16*3)(ctx), T0
  1171  		PXOR T0, B3
  1172  		MOVOU (16*4)(ctx), T0
  1173  		PXOR T0, B4
  1174  		MOVOU (16*5)(ctx), T0
  1175  		PXOR T0, B5
  1176  		MOVOU (16*6)(ctx), T0
  1177  		PXOR T0, B6
  1178  		MOVOU (16*7)(ctx), T0
  1179  		PXOR T0, B7
  1180  
  1181  		MOVOU B0, (16*0)(ptx)
  1182  		MOVOU B1, (16*1)(ptx)
  1183  		MOVOU B2, (16*2)(ptx)
  1184  		MOVOU B3, (16*3)(ptx)
  1185  		MOVOU B4, (16*4)(ptx)
  1186  		MOVOU B5, (16*5)(ptx)
  1187  		MOVOU B6, (16*6)(ptx)
  1188  		MOVOU B7, (16*7)(ptx)
  1189  
  1190  		LEAQ 128(ptx), ptx
  1191  		LEAQ 128(ctx), ctx
  1192  
  1193  		JMP gcmAesDecOctetsLoop
  1194  
  1195  gcmAesDecEndOctets:
  1196  
  1197  	SUBQ $7, aluCTR
  1198  
  1199  gcmAesDecSingles:
  1200  
  1201  	MOVOU (16*1)(ks), B1
  1202  	MOVOU (16*2)(ks), B2
  1203  	MOVOU (16*3)(ks), B3
  1204  	MOVOU (16*4)(ks), B4
  1205  	MOVOU (16*5)(ks), B5
  1206  	MOVOU (16*6)(ks), B6
  1207  	MOVOU (16*7)(ks), B7
  1208  
  1209  	MOVOU (16*14)(pTbl), T2
  1210  
  1211  gcmAesDecSinglesLoop:
  1212  
  1213  		CMPQ ptxLen, $16
  1214  		JB gcmAesDecTail
  1215  		SUBQ $16, ptxLen
  1216  
  1217  		MOVOU (ctx), B0
  1218  		MOVOU B0, T1
  1219  		PSHUFB BSWAP, B0
  1220  		PXOR ACC0, B0
  1221  
  1222  		MOVOU T2, ACC0
  1223  		MOVOU T2, ACC1
  1224  		MOVOU (16*15)(pTbl), ACCM
  1225  
  1226  		PCLMULQDQ $0x00, B0, ACC0
  1227  		PCLMULQDQ $0x11, B0, ACC1
  1228  		PSHUFD $78, B0, T0
  1229  		PXOR B0, T0
  1230  		PCLMULQDQ $0x00, T0, ACCM
  1231  
  1232  		PXOR ACC0, ACCM
  1233  		PXOR ACC1, ACCM
  1234  		MOVOU ACCM, T0
  1235  		PSRLDQ $8, ACCM
  1236  		PSLLDQ $8, T0
  1237  		PXOR ACCM, ACC1
  1238  		PXOR T0, ACC0
  1239  
  1240  		reduceRound(ACC0)
  1241  		reduceRound(ACC0)
  1242  		PXOR ACC1, ACC0
  1243  
  1244  		MOVOU (0*16)(SP), B0
  1245  		increment(0)
  1246  		AESENC B1, B0
  1247  		AESENC B2, B0
  1248  		AESENC B3, B0
  1249  		AESENC B4, B0
  1250  		AESENC B5, B0
  1251  		AESENC B6, B0
  1252  		AESENC B7, B0
  1253  		MOVOU (16*8)(ks), T0
  1254  		AESENC T0, B0
  1255  		MOVOU (16*9)(ks), T0
  1256  		AESENC T0, B0
  1257  		MOVOU (16*10)(ks), T0
  1258  		CMPQ NR, $12
  1259  		JB decLast2
  1260  		AESENC T0, B0
  1261  		MOVOU (16*11)(ks), T0
  1262  		AESENC T0, B0
  1263  		MOVOU (16*12)(ks), T0
  1264  		JE decLast2
  1265  		AESENC T0, B0
  1266  		MOVOU (16*13)(ks), T0
  1267  		AESENC T0, B0
  1268  		MOVOU (16*14)(ks), T0
  1269  decLast2:
  1270  		AESENCLAST T0, B0
  1271  
  1272  		PXOR T1, B0
  1273  		MOVOU B0, (ptx)
  1274  
  1275  		LEAQ (16*1)(ptx), ptx
  1276  		LEAQ (16*1)(ctx), ctx
  1277  
  1278  	JMP gcmAesDecSinglesLoop
  1279  
  1280  gcmAesDecTail:
  1281  
  1282  	TESTQ ptxLen, ptxLen
  1283  	JE gcmAesDecDone
  1284  
  1285  	MOVQ ptxLen, aluTMP
  1286  	SHLQ $4, aluTMP
  1287  	LEAQ andMask<>(SB), aluCTR
  1288  	MOVOU -16(aluCTR)(aluTMP*1), T1
  1289  
  1290  	MOVOU (ctx), B0	// Full 16-byte load although only ptxLen (< 16) bytes remain: this assumes the TAG is attached after the ciphertext in ctx, so the read does not overflow the buffer
  1291  	PAND T1, B0
  1292  
  1293  	MOVOU B0, T1
  1294  	PSHUFB BSWAP, B0
  1295  	PXOR ACC0, B0
  1296  
  1297  	MOVOU (16*14)(pTbl), ACC0
  1298  	MOVOU (16*15)(pTbl), ACCM
  1299  	MOVOU ACC0, ACC1
  1300  
  1301  	PCLMULQDQ $0x00, B0, ACC0
  1302  	PCLMULQDQ $0x11, B0, ACC1
  1303  	PSHUFD $78, B0, T0
  1304  	PXOR B0, T0
  1305  	PCLMULQDQ $0x00, T0, ACCM
  1306  
  1307  	PXOR ACC0, ACCM
  1308  	PXOR ACC1, ACCM
  1309  	MOVOU ACCM, T0
  1310  	PSRLDQ $8, ACCM
  1311  	PSLLDQ $8, T0
  1312  	PXOR ACCM, ACC1
  1313  	PXOR T0, ACC0
  1314  
  1315  	reduceRound(ACC0)
  1316  	reduceRound(ACC0)
  1317  	PXOR ACC1, ACC0
  1318  
  1319  	MOVOU (0*16)(SP), B0
  1320  	increment(0)
  1321  	AESENC B1, B0
  1322  	AESENC B2, B0
  1323  	AESENC B3, B0
  1324  	AESENC B4, B0
  1325  	AESENC B5, B0
  1326  	AESENC B6, B0
  1327  	AESENC B7, B0
  1328  	MOVOU (16*8)(ks), T0
  1329  	AESENC T0, B0
  1330  	MOVOU (16*9)(ks), T0
  1331  	AESENC T0, B0
  1332  	MOVOU (16*10)(ks), T0
  1333  	CMPQ NR, $12
  1334  	JB decLast3
  1335  	AESENC T0, B0
  1336  	MOVOU (16*11)(ks), T0
  1337  	AESENC T0, B0
  1338  	MOVOU (16*12)(ks), T0
  1339  	JE decLast3
  1340  	AESENC T0, B0
  1341  	MOVOU (16*13)(ks), T0
  1342  	AESENC T0, B0
  1343  	MOVOU (16*14)(ks), T0
  1344  decLast3:
  1345  	AESENCLAST T0, B0
  1346  	PXOR T1, B0
  1347  
  1348  ptxStoreLoop:
  1349  		PEXTRB $0, B0, (ptx)
  1350  		PSRLDQ $1, B0
  1351  		LEAQ 1(ptx), ptx
  1352  		DECQ ptxLen
  1353  
  1354  	JNE ptxStoreLoop
  1355  
  1356  gcmAesDecDone:
  1357  
  1358  	MOVOU ACC0, (tPtr)
  1359  	RET