github.com/MangoDowner/go-gm@v0.0.0-20180818020936-8baa2bd4408c/src/crypto/aes/gcm_amd64.s (about)

     1  // Copyright 2015 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // This is an optimized implementation of AES-GCM using AES-NI and CLMUL-NI
     6  // The implementation uses some optimizations as described in:
     7  // [1] Gueron, S., Kounavis, M.E.: Intel® Carry-Less Multiplication
     8  //     Instruction and its Usage for Computing the GCM Mode rev. 2.02
     9  // [2] Gueron, S., Krasnov, V.: Speeding up Counter Mode in Software and
    10  //     Hardware
    11  
    12  #include "textflag.h"
    13  
        // B0-B7 alias X0-X7. They hold the (up to eight) AES blocks that are being
        // encrypted in parallel; the single-block paths also use B1-B7 to cache
        // round keys 1-7.
    14  #define B0 X0
    15  #define B1 X1
    16  #define B2 X2
    17  #define B3 X3
    18  #define B4 X4
    19  #define B5 X5
    20  #define B6 X6
    21  #define B7 X7
    22  
        // Accumulators for the carry-less (Karatsuba) multiplication:
        // ACC0 collects the low*low products, ACC1 the high*high products and
        // ACCM the middle term.
    23  #define ACC0 X8
    24  #define ACC1 X9
    25  #define ACCM X10
    26  
        // T0-T2 are scratch registers. POLY and BSWAP are loaded by each routine
        // from gcmPoly<> and bswapMask<> respectively.
    27  #define T0 X11
    28  #define T1 X12
    29  #define T2 X13
    30  #define POLY X14
    31  #define BSWAP X15
    32  
        // bswapMask is a PSHUFB control vector that reverses the order of all
        // 16 bytes of an XMM register (result byte i = source byte 15-i).
    33  DATA bswapMask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f
    34  DATA bswapMask<>+0x08(SB)/8, $0x0001020304050607
    35  
        // gcmPoly is the constant loaded into POLY. It is used by the reduction
        // steps that follow every carry-less multiplication and by the "H * 2"
        // step in gcmAesInit.
    36  DATA gcmPoly<>+0x00(SB)/8, $0x0000000000000001
    37  DATA gcmPoly<>+0x08(SB)/8, $0xc200000000000000
    38  
        // andMask is a table of fifteen 16-byte masks. Entry k (k = 0..14, at
        // offset 16*k) has its k+1 lowest bytes set to 0xff and the rest zero.
        // The tail code of gcmAesEnc/gcmAesDec indexes it as -16(base)(len*16)
        // to keep only the first len bytes of a partial block (len = 1..15).
    39  DATA andMask<>+0x00(SB)/8, $0x00000000000000ff
    40  DATA andMask<>+0x08(SB)/8, $0x0000000000000000
    41  DATA andMask<>+0x10(SB)/8, $0x000000000000ffff
    42  DATA andMask<>+0x18(SB)/8, $0x0000000000000000
    43  DATA andMask<>+0x20(SB)/8, $0x0000000000ffffff
    44  DATA andMask<>+0x28(SB)/8, $0x0000000000000000
    45  DATA andMask<>+0x30(SB)/8, $0x00000000ffffffff
    46  DATA andMask<>+0x38(SB)/8, $0x0000000000000000
    47  DATA andMask<>+0x40(SB)/8, $0x000000ffffffffff
    48  DATA andMask<>+0x48(SB)/8, $0x0000000000000000
    49  DATA andMask<>+0x50(SB)/8, $0x0000ffffffffffff
    50  DATA andMask<>+0x58(SB)/8, $0x0000000000000000
    51  DATA andMask<>+0x60(SB)/8, $0x00ffffffffffffff
    52  DATA andMask<>+0x68(SB)/8, $0x0000000000000000
    53  DATA andMask<>+0x70(SB)/8, $0xffffffffffffffff
    54  DATA andMask<>+0x78(SB)/8, $0x0000000000000000
    55  DATA andMask<>+0x80(SB)/8, $0xffffffffffffffff
    56  DATA andMask<>+0x88(SB)/8, $0x00000000000000ff
    57  DATA andMask<>+0x90(SB)/8, $0xffffffffffffffff
    58  DATA andMask<>+0x98(SB)/8, $0x000000000000ffff
    59  DATA andMask<>+0xa0(SB)/8, $0xffffffffffffffff
    60  DATA andMask<>+0xa8(SB)/8, $0x0000000000ffffff
    61  DATA andMask<>+0xb0(SB)/8, $0xffffffffffffffff
    62  DATA andMask<>+0xb8(SB)/8, $0x00000000ffffffff
    63  DATA andMask<>+0xc0(SB)/8, $0xffffffffffffffff
    64  DATA andMask<>+0xc8(SB)/8, $0x000000ffffffffff
    65  DATA andMask<>+0xd0(SB)/8, $0xffffffffffffffff
    66  DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
    67  DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff
    68  DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
    69  
        // All three tables are read-only and contain no pointers.
        // Sizes: 16, 16 and 15*16 = 240 bytes.
    70  GLOBL bswapMask<>(SB), (NOPTR+RODATA), $16
    71  GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16
    72  GLOBL andMask<>(SB), (NOPTR+RODATA), $240
    73  
    74  // func hasGCMAsm() bool
    75  // Reports whether the CPU supports every instruction-set extension that the
        // routines in this file execute:
        //   - AES-NI            (AESENC/AESENCLAST)            CPUID.1:ECX bit 25
        //   - PCLMULQDQ         (carry-less multiply)          CPUID.1:ECX bit 1
        //   - SSSE3             (PSHUFB)                       CPUID.1:ECX bit 9
        //   - SSE4.1            (PINSRB/PINSRD/PINSRQ/PEXTRB)  CPUID.1:ECX bit 19
        // Returns true only when all four feature bits are set, so that callers
        // never dispatch to code that would raise an illegal-instruction fault.
    76  TEXT ·hasGCMAsm(SB),NOSPLIT,$0
    77  	XORQ AX, AX
    78  	INCL AX	// EAX = 1: processor info and feature bits leaf
    79  	CPUID
        	// 0x02080202 = 1<<25 | 1<<19 | 1<<9 | 1<<1
    80  	ANDL $0x02080202, CX	// keep only the four required feature bits
    81  	CMPL CX, $0x02080202	// equal only if every one of them is set
    82  	SETEQ ret+0(FP)
    83  	RET
    87  
    88  // func aesEncBlock(dst, src *[16]byte, ks []uint32)
        // Encrypts the single 16-byte block at src with the expanded key schedule
        // ks and stores the result at dst. The number of rounds is derived from
        // the key schedule length: rounds = len(ks)/4 - 1. Fewer than 12 rounds
        // takes the 10-round path, exactly 12 the 12-round path, and anything
        // larger the 14-round path.
        // Register use: R8 = src, R9 = key schedule, R10 = dst, R11 = rounds,
        // X2 = cipher state, X3 = current round key.
    89  TEXT ·aesEncBlock(SB),NOSPLIT,$0
    90  	MOVQ ks_len+24(FP), R11
    91  	MOVQ ks_base+16(FP), R9
    92  	MOVQ src+8(FP), R8
    93  	MOVQ dst+0(FP), R10
    94  
    95  	SHRQ $2, R11	// words -> 16-byte round keys
    96  	DECQ R11	// round keys -> rounds
    97  
        	// Whitening with round key 0, then the nine rounds every key size has.
    98  	MOVOU (R8), X2
    99  	MOVOU (16*0)(R9), X3
   100  	PXOR X3, X2
   101  	MOVOU (16*1)(R9), X3
   102  	AESENC X3, X2
   103  	MOVOU (16*2)(R9), X3
   104  	AESENC X3, X2
   105  	MOVOU (16*3)(R9), X3
   106  	AESENC X3, X2
   107  	MOVOU (16*4)(R9), X3
   108  	AESENC X3, X2
   109  	MOVOU (16*5)(R9), X3
   110  	AESENC X3, X2
   111  	MOVOU (16*6)(R9), X3
   112  	AESENC X3, X2
   113  	MOVOU (16*7)(R9), X3
   114  	AESENC X3, X2
   115  	MOVOU (16*8)(R9), X3
   116  	AESENC X3, X2
   117  	MOVOU (16*9)(R9), X3
   118  	AESENC X3, X2
        	// X3 always holds the key for the next round; when we branch to the
        	// final round it is consumed by AESENCLAST instead of AESENC.
   119  	MOVOU (16*10)(R9), X3
   120  	CMPQ R11, $12
   121  	JB blockLastRound	// fewer than 12 rounds: key 10 is the last one
   122  	AESENC X3, X2
   123  	MOVOU (16*11)(R9), X3
   124  	AESENC X3, X2
   125  	MOVOU (16*12)(R9), X3
        	// MOVOU and AESENC leave the flags alone, so this still tests the
        	// CMPQ above.
   126  	JE blockLastRound	// exactly 12 rounds: key 12 is the last one
   127  	AESENC X3, X2
   128  	MOVOU (16*13)(R9), X3
   129  	AESENC X3, X2
   130  	MOVOU (16*14)(R9), X3
   131  
   132  blockLastRound:
   133  	AESENCLAST X3, X2
   134  	MOVOU X2, (R10)
   135  
   136  	RET
   137  
   138  // func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
        // Completes the authentication tag in place. T is read as the running hash
        // value (kept in the byte-reversed form used by the other routines in this
        // file). The routine:
        //   1. builds the length block: pLen*8 in the low 64 bits and dLen*8 in
        //      the high 64 bits (byte counts converted to bit counts),
        //   2. XORs it into the hash and multiplies by the hash key, using the
        //      productTable entries at offsets 16*14 and 16*15 that gcmAesInit
        //      stores,
        //   3. reduces the product, reverses the byte order, XORs in tagMask and
        //      writes the 16-byte result back to T.
   139  TEXT ·gcmAesFinish(SB),NOSPLIT,$0
   140  #define pTbl DI
   141  #define tMsk SI
   142  #define tPtr DX
   143  #define plen AX
   144  #define dlen CX
   145  
   146  	MOVQ productTable+0(FP), pTbl
   147  	MOVQ tagMask+8(FP), tMsk
   148  	MOVQ T+16(FP), tPtr
   149  	MOVQ pLen+24(FP), plen
   150  	MOVQ dLen+32(FP), dlen
   151  
   152  	MOVOU (tPtr), ACC0
   153  	MOVOU (tMsk), T2
   154  
   155  	MOVOU bswapMask<>(SB), BSWAP
   156  	MOVOU gcmPoly<>(SB), POLY
   157  
        	// Convert the byte counts to bit counts.
   158  	SHLQ $3, plen
   159  	SHLQ $3, dlen
   160  
        	// B0 = length block: low qword = plen, high qword = dlen.
   161  	MOVQ plen, B0
   162  	PINSRQ $1, dlen, B0
   163  
   164  	PXOR ACC0, B0	// fold the running hash into the length block
   165  
        	// Karatsuba multiplication of B0 by the table entry at 16*14, using the
        	// precomputed (high XOR low) helper at 16*15 for the middle term.
   166  	MOVOU (16*14)(pTbl), ACC0
   167  	MOVOU (16*15)(pTbl), ACCM
   168  	MOVOU ACC0, ACC1
   169  
   170  	PCLMULQDQ $0x00, B0, ACC0	// low * low
   171  	PCLMULQDQ $0x11, B0, ACC1	// high * high
   172  	PSHUFD $78, B0, T0	// swap the two qwords of B0
   173  	PXOR B0, T0	// T0 = high XOR low in both qwords
   174  	PCLMULQDQ $0x00, T0, ACCM	// middle product
   175  
        	// Recover the middle term and fold it into the 256-bit product
        	// ACC1:ACC0.
   176  	PXOR ACC0, ACCM
   177  	PXOR ACC1, ACCM
   178  	MOVOU ACCM, T0
   179  	PSRLDQ $8, ACCM
   180  	PSLLDQ $8, T0
   181  	PXOR ACCM, ACC1
   182  	PXOR T0, ACC0
   183  
        	// Two reduction steps using the POLY constant.
   184  	MOVOU POLY, T0
   185  	PCLMULQDQ $0x01, ACC0, T0
   186  	PSHUFD $78, ACC0, ACC0
   187  	PXOR T0, ACC0
   188  
   189  	MOVOU POLY, T0
   190  	PCLMULQDQ $0x01, ACC0, T0
   191  	PSHUFD $78, ACC0, ACC0
   192  	PXOR T0, ACC0
   193  
   194  	PXOR ACC1, ACC0
   195  
        	// Restore the byte order, apply the tag mask and store the tag.
   196  	PSHUFB BSWAP, ACC0
   197  	PXOR T2, ACC0
   198  	MOVOU ACC0, (tPtr)
   199  
   200  	RET
   201  #undef pTbl
   202  #undef tMsk
   203  #undef tPtr
   204  #undef plen
   205  #undef dlen
   206  
   207  // func gcmAesInit(productTable *[256]byte, ks []uint32)
        // Derives the hash key from the key schedule ks and fills productTable
        // with the values the other routines multiply by. The table holds eight
        // 32-byte entries; for k = 1..8 the entry at offset 16*(16-2k) is the k-th
        // value in the chain of key powers computed below, immediately followed
        // (at +16) by its Karatsuba helper: the value with its two qwords XORed
        // together. The first power therefore lives at 16*14 and the eighth at
        // 16*0.
        // The number of AES rounds is len(ks)/4 - 1.
   208  TEXT ·gcmAesInit(SB),NOSPLIT,$0
   209  #define dst DI
   210  #define KS SI
   211  #define NR DX
   212  
   213  	MOVQ productTable+0(FP), dst
   214  	MOVQ ks_base+8(FP), KS
   215  	MOVQ ks_len+16(FP), NR
   216  
   217  	SHRQ $2, NR
   218  	DECQ NR
   219  
   220  	MOVOU bswapMask<>(SB), BSWAP
   221  	MOVOU gcmPoly<>(SB), POLY
   222  
   223  	// Encrypt block 0, with the AES key to generate the hash key H
        	// (the all-zero block XOR round key 0 is just round key 0, so the
        	// state starts out as a copy of that key)
   224  	MOVOU (16*0)(KS), B0
   225  	MOVOU (16*1)(KS), T0
   226  	AESENC T0, B0
   227  	MOVOU (16*2)(KS), T0
   228  	AESENC T0, B0
   229  	MOVOU (16*3)(KS), T0
   230  	AESENC T0, B0
   231  	MOVOU (16*4)(KS), T0
   232  	AESENC T0, B0
   233  	MOVOU (16*5)(KS), T0
   234  	AESENC T0, B0
   235  	MOVOU (16*6)(KS), T0
   236  	AESENC T0, B0
   237  	MOVOU (16*7)(KS), T0
   238  	AESENC T0, B0
   239  	MOVOU (16*8)(KS), T0
   240  	AESENC T0, B0
   241  	MOVOU (16*9)(KS), T0
   242  	AESENC T0, B0
   243  	MOVOU (16*10)(KS), T0
   244  	CMPQ NR, $12
   245  	JB initEncLast	// fewer than 12 rounds: key 10 is the final key
   246  	AESENC T0, B0
   247  	MOVOU (16*11)(KS), T0
   248  	AESENC T0, B0
   249  	MOVOU (16*12)(KS), T0
   250  	JE initEncLast	// flags still come from the CMPQ above: exactly 12 rounds
   251  	AESENC T0, B0
   252  	MOVOU (16*13)(KS), T0
   253  	AESENC T0, B0
   254  	MOVOU (16*14)(KS), T0
   255  initEncLast:
   256  	AESENCLAST T0, B0
   257  
   258  	PSHUFB BSWAP, B0
   259  	// H * 2
        	// (128-bit left shift by one built from 32-bit lane shifts; the bit
        	// shifted out of the top selects whether POLY is XORed in)
   260  	PSHUFD $0xff, B0, T0	// broadcast the top dword
   261  	MOVOU B0, T1
   262  	PSRAL $31, T0	// all-ones if the top bit was set, else zero
   263  	PAND POLY, T0
   264  	PSRLL $31, T1	// the bit shifted out of each 32-bit lane ...
   265  	PSLLDQ $4, T1	// ... moved into the next lane up
   266  	PSLLL $1, B0
   267  	PXOR T0, B0
   268  	PXOR T1, B0
   269  	// Karatsuba pre-computations
   270  	MOVOU B0, (16*14)(dst)
   271  	PSHUFD $78, B0, B1
   272  	PXOR B0, B1
   273  	MOVOU B1, (16*15)(dst)
   274  
   275  	MOVOU B0, B2
   276  	MOVOU B1, B3
   277  	// Now prepare powers of H and pre-computations for them
        	// B2/B3 hold the most recent power and its helper; each of the seven
        	// iterations multiplies it by B0 once more and stores the result one
        	// 32-byte table entry lower.
   278  	MOVQ $7, AX
   279  
   280  initLoop:
   281  		MOVOU B2, T0
   282  		MOVOU B2, T1
   283  		MOVOU B3, T2
   284  		PCLMULQDQ $0x00, B0, T0
   285  		PCLMULQDQ $0x11, B0, T1
   286  		PCLMULQDQ $0x00, B1, T2
   287  
        		// Recover the Karatsuba middle term and fold it into T1:T0.
   288  		PXOR T0, T2
   289  		PXOR T1, T2
   290  		MOVOU T2, B4
   291  		PSLLDQ $8, B4
   292  		PSRLDQ $8, T2
   293  		PXOR B4, T0
   294  		PXOR T2, T1
   295  
        		// Two reduction steps; the reduced product ends up in B2.
   296  		MOVOU POLY, B2
   297  		PCLMULQDQ $0x01, T0, B2
   298  		PSHUFD $78, T0, T0
   299  		PXOR B2, T0
   300  		MOVOU POLY, B2
   301  		PCLMULQDQ $0x01, T0, B2
   302  		PSHUFD $78, T0, T0
   303  		PXOR T0, B2
   304  		PXOR T1, B2
   305  
   306  		MOVOU B2, (16*12)(dst)
   307  		PSHUFD $78, B2, B3
   308  		PXOR B2, B3
   309  		MOVOU B3, (16*13)(dst)
   310  
        		// LEAQ does not modify the flags, so the JNE below still tests the
        		// result of DECQ.
   311  		DECQ AX
   312  		LEAQ (-16*2)(dst), dst
   313  	JNE initLoop
   314  
   315  	RET
   316  #undef NR
   317  #undef KS
   318  #undef dst
   319  
   320  // func gcmAesData(productTable *[256]byte, data []byte, T *[16]byte)
        // Hashes data and stores the resulting 16-byte hash state in T. The
        // accumulator starts at zero (the previous contents of T are not read).
        // data is consumed in 16-byte blocks; a final partial block is zero-padded.
        // Each block is byte-reversed, XORed into the accumulator and multiplied
        // by the productTable entry at 16*14 (helper at 16*15). The value written
        // to T is left in the byte-reversed representation.
        // A 13-byte input is special-cased and loaded with three fixed loads.
   321  TEXT ·gcmAesData(SB),NOSPLIT,$0
   322  #define pTbl DI
   323  #define aut SI
   324  #define tPtr CX
   325  #define autLen DX
   326  
   327  	MOVQ productTable+0(FP), pTbl
   328  	MOVQ data_base+8(FP), aut
   329  	MOVQ data_len+16(FP), autLen
   330  	MOVQ T+32(FP), tPtr
   331  
   332  	PXOR ACC0, ACC0	// hash accumulator starts at zero
   333  	MOVOU bswapMask<>(SB), BSWAP
   334  	MOVOU gcmPoly<>(SB), POLY
   335  
        	// T1/T2 cache the table entry and its Karatsuba helper for the whole
        	// routine.
   336  	MOVOU (16*14)(pTbl), T1
   337  	MOVOU (16*15)(pTbl), T2
   338  
   339  	TESTQ autLen, autLen
   340  	JEQ dataBail
   341  
   342  	CMPQ autLen, $13	// optimize the TLS case
   343  	JNE dataSinglesLoop
   344  
        	// Exactly 13 bytes: load 8 + 4 + 1 bytes into a zeroed register, then
        	// clear the length so the loop exits after this single multiplication.
   345  	PXOR B0, B0
   346  	MOVQ (aut), B0
   347  	PINSRD $2, 8(aut), B0
   348  	PINSRB $12, 12(aut), B0
   349  	XORQ autLen, autLen
   350  	JMP dataMul
   351  
   352  dataSinglesLoop:
   353  
   354  		CMPQ autLen, $16
   355  		JB dataEnd
   356  		SUBQ $16, autLen
   357  
   358  		MOVOU (aut), B0
        		// dataMul is also entered from the 13-byte fast path and from the
        		// partial-block path below, with the block already in B0.
   359  dataMul:
   360  		PSHUFB BSWAP, B0
   361  		PXOR ACC0, B0
   362  
   363  		MOVOU T1, ACC0
   364  		MOVOU T2, ACCM
   365  		MOVOU T1, ACC1
   366  
   367  		PSHUFD $78, B0, T0
   368  		PXOR B0, T0
   369  		PCLMULQDQ $0x00, B0, ACC0
   370  		PCLMULQDQ $0x11, B0, ACC1
   371  		PCLMULQDQ $0x00, T0, ACCM
   372  
        		// Recover the Karatsuba middle term and fold it into ACC1:ACC0.
   373  		PXOR ACC0, ACCM
   374  		PXOR ACC1, ACCM
   375  		MOVOU ACCM, T0
   376  		PSRLDQ $8, ACCM
   377  		PSLLDQ $8, T0
   378  		PXOR ACCM, ACC1
   379  		PXOR T0, ACC0
   380  
        		// Two reduction steps using the POLY constant.
   381  		MOVOU POLY, T0
   382  		PCLMULQDQ $0x01, ACC0, T0
   383  		PSHUFD $78, ACC0, ACC0
   384  		PXOR T0, ACC0
   385  
   386  		MOVOU POLY, T0
   387  		PCLMULQDQ $0x01, ACC0, T0
   388  		PSHUFD $78, ACC0, ACC0
   389  		PXOR T0, ACC0
   390  		PXOR ACC1, ACC0
   391  
   392  		LEAQ 16(aut), aut
   393  
   394  	JMP dataSinglesLoop
   395  
   396  dataEnd:
   397  
   398  	TESTQ autLen, autLen
   399  	JEQ dataBail
   400  
        	// 1..15 bytes remain. Point aut at the last byte and build the block
        	// back to front: each step shifts B0 up by one byte and inserts the next
        	// (earlier) input byte at position 0, so the bytes end up in their
        	// original order with zeros above them.
   401  	PXOR B0, B0
   402  	LEAQ -1(aut)(autLen*1), aut
   403  
   404  dataLoadLoop:
   405  
   406  		PSLLDQ $1, B0
   407  		PINSRB $0, (aut), B0
   408  
   409  		LEAQ -1(aut), aut
   410  		DECQ autLen
   411  		JNE dataLoadLoop
   412  
        	// autLen is now zero, so after this multiplication the loop falls
        	// through dataEnd to dataBail.
   413  	JMP dataMul
   414  
   415  dataBail:
   416  	MOVOU ACC0, (tPtr)
   417  	RET
   418  #undef pTbl
   419  #undef aut
   420  #undef tPtr
   421  #undef autLen
   422  
   423  // func gcmAesEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
        // Encrypts src into dst in counter mode and folds the produced ciphertext
        // into the hash state T (read at entry, written back at exit).
        // The counter block at ctr is incremented before its first use, so the
        // first block of src is encrypted with ctr+1. Only the last 32 bits of
        // the counter block are incremented (big-endian).
        //
        // Stack frame (256 bytes):
        //   0..127   eight byte-reversed ciphertext blocks waiting to be hashed
        //   128..255 eight counter blocks, already XORed with round key 0
        //
        // Structure:
        //   1. If at least 128 bytes are available, encrypt the first eight
        //      blocks (nothing to hash yet).
        //   2. gcmAesEncOctetsLoop: encrypt the next eight blocks while hashing
        //      the previous eight, AES rounds and multiplications interleaved.
        //   3. gcmAesEncOctetsEnd: hash the last eight ciphertext blocks.
        //   4. gcmAesEncSinglesLoop: remaining whole blocks, one at a time.
        //   5. gcmAesEncTail: final partial block (1..15 bytes).
        // The number of AES rounds is len(ks)/4 - 1.
   424  TEXT ·gcmAesEnc(SB),0,$256-96
        // Register roles. These aliases (except increment) are not #undef'd at
        // the end of this function.
   425  #define pTbl DI
   426  #define ctx DX
   427  #define ctrPtr CX
   428  #define ptx SI
   429  #define ks AX
   430  #define tPtr R8
   431  #define ptxLen R9
   432  #define aluCTR R10
   433  #define aluTMP R11
   434  #define aluK R12
   435  #define NR R13
   436  
        // increment(i): advance the 32-bit counter held in aluCTR, XOR it with
        // the matching word of round key 0 (aluK), convert it back to big-endian
        // and store it as the last dword of counter slot i (at 128+16*i on the
        // stack).
   437  #define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + 8*16 + i*16)(SP)
        // aesRnd(k): one AES round on B0-B7 with the round key in register k.
   438  #define aesRnd(k) AESENC k, B0; AESENC k, B1; AESENC k, B2; AESENC k, B3; AESENC k, B4; AESENC k, B5; AESENC k, B6; AESENC k, B7
        // aesRound(i): load round key i into T0 and apply one AES round to B0-B7.
   439  #define aesRound(i) MOVOU (16*i)(ks), T0;AESENC T0, B0; AESENC T0, B1; AESENC T0, B2; AESENC T0, B3; AESENC T0, B4; AESENC T0, B5; AESENC T0, B6; AESENC T0, B7
        // aesRndLast(k): final AES round on B0-B7 with the round key in k.
   440  #define aesRndLast(k) AESENCLAST k, B0; AESENCLAST k, B1; AESENCLAST k, B2; AESENCLAST k, B3; AESENCLAST k, B4; AESENCLAST k, B5; AESENCLAST k, B6; AESENCLAST k, B7
        // reduceRound(a): one reduction step on a, using POLY; clobbers T0.
   441  #define reduceRound(a) 	MOVOU POLY, T0;	PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a
        // combinedRound(i): AES round i on B0-B7, interleaved with the three
        // Karatsuba products of the saved ciphertext block in stack slot i and
        // table entry i (offsets 32*i and 32*i+16), accumulated into
        // ACC0/ACC1/ACCM. Clobbers T0-T2.
   442  #define combinedRound(i) \
   443  	MOVOU (16*i)(ks), T0;\
   444  	AESENC T0, B0;\
   445  	AESENC T0, B1;\
   446  	AESENC T0, B2;\
   447  	AESENC T0, B3;\
   448  	 MOVOU (16*(i*2))(pTbl), T1;\
   449  	 MOVOU T1, T2;\
   450  	AESENC T0, B4;\
   451  	AESENC T0, B5;\
   452  	AESENC T0, B6;\
   453  	AESENC T0, B7;\
   454  	 MOVOU (16*i)(SP), T0;\
   455  	 PCLMULQDQ $0x00, T0, T1;\
   456  	 PXOR T1, ACC0;\
   457  	 PSHUFD $78, T0, T1;\
   458  	 PCLMULQDQ $0x11, T0, T2;\
   459  	 PXOR T1, T0;\
   460  	 PXOR T2, ACC1;\
   461  	 MOVOU (16*(i*2+1))(pTbl), T2;\
   462  	 PCLMULQDQ $0x00, T2, T0;\
   463  	 PXOR T0, ACCM
        // mulRound(i): the hashing half of combinedRound(i) on its own.
   464  #define mulRound(i) \
   465  	MOVOU (16*i)(SP), T0;\
   466  	MOVOU (16*(i*2))(pTbl), T1;\
   467  	MOVOU T1, T2;\
   468  	PCLMULQDQ $0x00, T0, T1;\
   469  	PXOR T1, ACC0;\
   470  	PCLMULQDQ $0x11, T0, T2;\
   471  	PXOR T2, ACC1;\
   472  	PSHUFD $78, T0, T1;\
   473  	PXOR T1, T0;\
   474  	MOVOU (16*(i*2+1))(pTbl), T1;\
   475  	PCLMULQDQ $0x00, T0, T1;\
   476  	PXOR T1, ACCM
   477  
   478  	MOVQ productTable+0(FP), pTbl
   479  	MOVQ dst+8(FP), ctx
   480  	MOVQ src_base+32(FP), ptx
   481  	MOVQ src_len+40(FP), ptxLen
   482  	MOVQ ctr+56(FP), ctrPtr
   483  	MOVQ T+64(FP), tPtr
   484  	MOVQ ks_base+72(FP), ks
   485  	MOVQ ks_len+80(FP), NR
   486  
   487  	SHRQ $2, NR
   488  	DECQ NR
   489  
   490  	MOVOU bswapMask<>(SB), BSWAP
   491  	MOVOU gcmPoly<>(SB), POLY
   492  
   493  	MOVOU (tPtr), ACC0
   494  	PXOR ACC1, ACC1
   495  	PXOR ACCM, ACCM
        	// aluCTR = 32-bit counter, aluK = last word of round key 0, both in
        	// host byte order.
   496  	MOVOU (ctrPtr), B0
   497  	MOVL (3*4)(ctrPtr), aluCTR
   498  	MOVOU (ks), T0
   499  	MOVL (3*4)(ks), aluK
   500  	BSWAPL aluCTR
   501  	BSWAPL aluK
   502  
        	// T0 = counter block XOR round key 0. It is the template for every
        	// counter slot; increment() then patches the counter word only.
   503  	PXOR B0, T0
   504  	MOVOU T0, (8*16 + 0*16)(SP)
   505  	increment(0)
   506  
   507  	CMPQ ptxLen, $128
   508  	JB gcmAesEncSingles
   509  	SUBQ $128, ptxLen
   510  
   511  	// We have at least 8 blocks to encrypt, prepare the rest of the counters
   512  	MOVOU T0, (8*16 + 1*16)(SP)
   513  	increment(1)
   514  	MOVOU T0, (8*16 + 2*16)(SP)
   515  	increment(2)
   516  	MOVOU T0, (8*16 + 3*16)(SP)
   517  	increment(3)
   518  	MOVOU T0, (8*16 + 4*16)(SP)
   519  	increment(4)
   520  	MOVOU T0, (8*16 + 5*16)(SP)
   521  	increment(5)
   522  	MOVOU T0, (8*16 + 6*16)(SP)
   523  	increment(6)
   524  	MOVOU T0, (8*16 + 7*16)(SP)
   525  	increment(7)
   526  
   527  	MOVOU (8*16 + 0*16)(SP), B0
   528  	MOVOU (8*16 + 1*16)(SP), B1
   529  	MOVOU (8*16 + 2*16)(SP), B2
   530  	MOVOU (8*16 + 3*16)(SP), B3
   531  	MOVOU (8*16 + 4*16)(SP), B4
   532  	MOVOU (8*16 + 5*16)(SP), B5
   533  	MOVOU (8*16 + 6*16)(SP), B6
   534  	MOVOU (8*16 + 7*16)(SP), B7
   535  
        	// Round key 0 is already applied, so start at round 1. The increment()
        	// calls between the rounds refill the counter slots for the next eight
        	// blocks.
   536  	aesRound(1)
   537  	increment(0)
   538  	aesRound(2)
   539  	increment(1)
   540  	aesRound(3)
   541  	increment(2)
   542  	aesRound(4)
   543  	increment(3)
   544  	aesRound(5)
   545  	increment(4)
   546  	aesRound(6)
   547  	increment(5)
   548  	aesRound(7)
   549  	increment(6)
   550  	aesRound(8)
   551  	increment(7)
   552  	aesRound(9)
   553  	MOVOU (16*10)(ks), T0
   554  	CMPQ NR, $12
   555  	JB encLast1
   556  	aesRnd(T0)
   557  	aesRound(11)
   558  	MOVOU (16*12)(ks), T0
   559  	JE encLast1	// flags still come from the CMPQ above
   560  	aesRnd(T0)
   561  	aesRound(13)
   562  	MOVOU (16*14)(ks), T0
   563  encLast1:
   564  	aesRndLast(T0)
   565  
        	// XOR the key stream with the plaintext.
   566  	MOVOU (16*0)(ptx), T0
   567  	PXOR T0, B0
   568  	MOVOU (16*1)(ptx), T0
   569  	PXOR T0, B1
   570  	MOVOU (16*2)(ptx), T0
   571  	PXOR T0, B2
   572  	MOVOU (16*3)(ptx), T0
   573  	PXOR T0, B3
   574  	MOVOU (16*4)(ptx), T0
   575  	PXOR T0, B4
   576  	MOVOU (16*5)(ptx), T0
   577  	PXOR T0, B5
   578  	MOVOU (16*6)(ptx), T0
   579  	PXOR T0, B6
   580  	MOVOU (16*7)(ptx), T0
   581  	PXOR T0, B7
   582  
        	// Store the ciphertext, then byte-reverse each block for hashing. The
        	// current hash value is folded into the first block only.
   583  	MOVOU B0, (16*0)(ctx)
   584  	PSHUFB BSWAP, B0
   585  	PXOR ACC0, B0
   586  	MOVOU B1, (16*1)(ctx)
   587  	PSHUFB BSWAP, B1
   588  	MOVOU B2, (16*2)(ctx)
   589  	PSHUFB BSWAP, B2
   590  	MOVOU B3, (16*3)(ctx)
   591  	PSHUFB BSWAP, B3
   592  	MOVOU B4, (16*4)(ctx)
   593  	PSHUFB BSWAP, B4
   594  	MOVOU B5, (16*5)(ctx)
   595  	PSHUFB BSWAP, B5
   596  	MOVOU B6, (16*6)(ctx)
   597  	PSHUFB BSWAP, B6
   598  	MOVOU B7, (16*7)(ctx)
   599  	PSHUFB BSWAP, B7
   600  
        	// Park the eight blocks in stack slots 0-7 until they are hashed.
   601  	MOVOU B0, (16*0)(SP)
   602  	MOVOU B1, (16*1)(SP)
   603  	MOVOU B2, (16*2)(SP)
   604  	MOVOU B3, (16*3)(SP)
   605  	MOVOU B4, (16*4)(SP)
   606  	MOVOU B5, (16*5)(SP)
   607  	MOVOU B6, (16*6)(SP)
   608  	MOVOU B7, (16*7)(SP)
   609  
   610  	LEAQ 128(ptx), ptx
   611  	LEAQ 128(ctx), ctx
   612  
   613  gcmAesEncOctetsLoop:
   614  
   615  		CMPQ ptxLen, $128
   616  		JB gcmAesEncOctetsEnd
   617  		SUBQ $128, ptxLen
   618  
   619  		MOVOU (8*16 + 0*16)(SP), B0
   620  		MOVOU (8*16 + 1*16)(SP), B1
   621  		MOVOU (8*16 + 2*16)(SP), B2
   622  		MOVOU (8*16 + 3*16)(SP), B3
   623  		MOVOU (8*16 + 4*16)(SP), B4
   624  		MOVOU (8*16 + 5*16)(SP), B5
   625  		MOVOU (8*16 + 6*16)(SP), B6
   626  		MOVOU (8*16 + 7*16)(SP), B7
   627  
        		// Start hashing the previous eight blocks: saved block 0 (which
        		// carries the running hash) times table entry 0.
   628  		MOVOU (16*0)(SP), T0
   629  		PSHUFD $78, T0, T1
   630  		PXOR T0, T1
   631  
   632  		MOVOU (16*0)(pTbl), ACC0
   633  		MOVOU (16*1)(pTbl), ACCM
   634  		MOVOU ACC0, ACC1
   635  
   636  		PCLMULQDQ $0x00, T1, ACCM
   637  		PCLMULQDQ $0x00, T0, ACC0
   638  		PCLMULQDQ $0x11, T0, ACC1
   639  
        		// AES rounds 1-7 for the new blocks, each paired with the
        		// multiplication of one saved block and with refilling one counter
        		// slot.
   640  		combinedRound(1)
   641  		increment(0)
   642  		combinedRound(2)
   643  		increment(1)
   644  		combinedRound(3)
   645  		increment(2)
   646  		combinedRound(4)
   647  		increment(3)
   648  		combinedRound(5)
   649  		increment(4)
   650  		combinedRound(6)
   651  		increment(5)
   652  		combinedRound(7)
   653  		increment(6)
   654  
   655  		aesRound(8)
   656  		increment(7)
   657  
        		// Recover the Karatsuba middle term and fold it into ACC1:ACC0.
   658  		PXOR ACC0, ACCM
   659  		PXOR ACC1, ACCM
   660  		MOVOU ACCM, T0
   661  		PSRLDQ $8, ACCM
   662  		PSLLDQ $8, T0
   663  		PXOR ACCM, ACC1
   664  		PXOR T0, ACC0
   665  
        		// The two reduction steps are interleaved with AES round 9.
   666  		reduceRound(ACC0)
   667  		aesRound(9)
   668  
   669  		reduceRound(ACC0)
   670  		PXOR ACC1, ACC0
   671  
   672  		MOVOU (16*10)(ks), T0
   673  		CMPQ NR, $12
   674  		JB encLast2
   675  		aesRnd(T0)
   676  		aesRound(11)
   677  		MOVOU (16*12)(ks), T0
   678  		JE encLast2	// flags still come from the CMPQ above
   679  		aesRnd(T0)
   680  		aesRound(13)
   681  		MOVOU (16*14)(ks), T0
   682  encLast2:
   683  		aesRndLast(T0)
   684  
   685  		MOVOU (16*0)(ptx), T0
   686  		PXOR T0, B0
   687  		MOVOU (16*1)(ptx), T0
   688  		PXOR T0, B1
   689  		MOVOU (16*2)(ptx), T0
   690  		PXOR T0, B2
   691  		MOVOU (16*3)(ptx), T0
   692  		PXOR T0, B3
   693  		MOVOU (16*4)(ptx), T0
   694  		PXOR T0, B4
   695  		MOVOU (16*5)(ptx), T0
   696  		PXOR T0, B5
   697  		MOVOU (16*6)(ptx), T0
   698  		PXOR T0, B6
   699  		MOVOU (16*7)(ptx), T0
   700  		PXOR T0, B7
   701  
   702  		MOVOU B0, (16*0)(ctx)
   703  		PSHUFB BSWAP, B0
   704  		PXOR ACC0, B0
   705  		MOVOU B1, (16*1)(ctx)
   706  		PSHUFB BSWAP, B1
   707  		MOVOU B2, (16*2)(ctx)
   708  		PSHUFB BSWAP, B2
   709  		MOVOU B3, (16*3)(ctx)
   710  		PSHUFB BSWAP, B3
   711  		MOVOU B4, (16*4)(ctx)
   712  		PSHUFB BSWAP, B4
   713  		MOVOU B5, (16*5)(ctx)
   714  		PSHUFB BSWAP, B5
   715  		MOVOU B6, (16*6)(ctx)
   716  		PSHUFB BSWAP, B6
   717  		MOVOU B7, (16*7)(ctx)
   718  		PSHUFB BSWAP, B7
   719  
   720  		MOVOU B0, (16*0)(SP)
   721  		MOVOU B1, (16*1)(SP)
   722  		MOVOU B2, (16*2)(SP)
   723  		MOVOU B3, (16*3)(SP)
   724  		MOVOU B4, (16*4)(SP)
   725  		MOVOU B5, (16*5)(SP)
   726  		MOVOU B6, (16*6)(SP)
   727  		MOVOU B7, (16*7)(SP)
   728  
   729  		LEAQ 128(ptx), ptx
   730  		LEAQ 128(ctx), ctx
   731  
   732  		JMP gcmAesEncOctetsLoop
   733  
   734  gcmAesEncOctetsEnd:
   735  
        	// Hash the last eight ciphertext blocks, which are still parked in
        	// stack slots 0-7.
   736  	MOVOU (16*0)(SP), T0
   737  	MOVOU (16*0)(pTbl), ACC0
   738  	MOVOU (16*1)(pTbl), ACCM
   739  	MOVOU ACC0, ACC1
   740  	PSHUFD $78, T0, T1
   741  	PXOR T0, T1
   742  	PCLMULQDQ $0x00, T0, ACC0
   743  	PCLMULQDQ $0x11, T0, ACC1
   744  	PCLMULQDQ $0x00, T1, ACCM
   745  
   746  	mulRound(1)
   747  	mulRound(2)
   748  	mulRound(3)
   749  	mulRound(4)
   750  	mulRound(5)
   751  	mulRound(6)
   752  	mulRound(7)
   753  
   754  	PXOR ACC0, ACCM
   755  	PXOR ACC1, ACCM
   756  	MOVOU ACCM, T0
   757  	PSRLDQ $8, ACCM
   758  	PSLLDQ $8, T0
   759  	PXOR ACCM, ACC1
   760  	PXOR T0, ACC0
   761  
   762  	reduceRound(ACC0)
   763  	reduceRound(ACC0)
   764  	PXOR ACC1, ACC0
   765  
   766  	TESTQ ptxLen, ptxLen
   767  	JE gcmAesEncDone
   768  
        	// Counter slots 0-7 were refilled for eight more blocks, but from here
        	// on only slot 0 is consumed. Rewind aluCTR by 7 so that the next
        	// increment(0) yields the value following the one stored in slot 0.
   769  	SUBQ $7, aluCTR
   770  
   771  gcmAesEncSingles:
   772  
        	// B1-B7 are reused to cache round keys 1-7, and T2 caches the table
        	// entry at 16*14 used for single-block hashing.
   773  	MOVOU (16*1)(ks), B1
   774  	MOVOU (16*2)(ks), B2
   775  	MOVOU (16*3)(ks), B3
   776  	MOVOU (16*4)(ks), B4
   777  	MOVOU (16*5)(ks), B5
   778  	MOVOU (16*6)(ks), B6
   779  	MOVOU (16*7)(ks), B7
   780  
   781  	MOVOU (16*14)(pTbl), T2
   782  
   783  gcmAesEncSinglesLoop:
   784  
   785  		CMPQ ptxLen, $16
   786  		JB gcmAesEncTail
   787  		SUBQ $16, ptxLen
   788  
        		// Take the counter block from slot 0 and prepare the next one.
   789  		MOVOU (8*16 + 0*16)(SP), B0
   790  		increment(0)
   791  
   792  		AESENC B1, B0
   793  		AESENC B2, B0
   794  		AESENC B3, B0
   795  		AESENC B4, B0
   796  		AESENC B5, B0
   797  		AESENC B6, B0
   798  		AESENC B7, B0
   799  		MOVOU (16*8)(ks), T0
   800  		AESENC T0, B0
   801  		MOVOU (16*9)(ks), T0
   802  		AESENC T0, B0
   803  		MOVOU (16*10)(ks), T0
   804  		CMPQ NR, $12
   805  		JB encLast3
   806  		AESENC T0, B0
   807  		MOVOU (16*11)(ks), T0
   808  		AESENC T0, B0
   809  		MOVOU (16*12)(ks), T0
   810  		JE encLast3	// flags still come from the CMPQ above
   811  		AESENC T0, B0
   812  		MOVOU (16*13)(ks), T0
   813  		AESENC T0, B0
   814  		MOVOU (16*14)(ks), T0
   815  encLast3:
   816  		AESENCLAST T0, B0
   817  
   818  		MOVOU (ptx), T0
   819  		PXOR T0, B0
   820  		MOVOU B0, (ctx)
   821  
        		// Fold the ciphertext block into the hash and multiply.
   822  		PSHUFB BSWAP, B0
   823  		PXOR ACC0, B0
   824  
   825  		MOVOU T2, ACC0
   826  		MOVOU T2, ACC1
   827  		MOVOU (16*15)(pTbl), ACCM
   828  
   829  		PSHUFD $78, B0, T0
   830  		PXOR B0, T0
   831  		PCLMULQDQ $0x00, B0, ACC0
   832  		PCLMULQDQ $0x11, B0, ACC1
   833  		PCLMULQDQ $0x00, T0, ACCM
   834  
   835  		PXOR ACC0, ACCM
   836  		PXOR ACC1, ACCM
   837  		MOVOU ACCM, T0
   838  		PSRLDQ $8, ACCM
   839  		PSLLDQ $8, T0
   840  		PXOR ACCM, ACC1
   841  		PXOR T0, ACC0
   842  
   843  		reduceRound(ACC0)
   844  		reduceRound(ACC0)
   845  		PXOR ACC1, ACC0
   846  
   847  		LEAQ (16*1)(ptx), ptx
   848  		LEAQ (16*1)(ctx), ctx
   849  
   850  	JMP gcmAesEncSinglesLoop
   851  
   852  gcmAesEncTail:
   853  	TESTQ ptxLen, ptxLen
   854  	JE gcmAesEncDone
   855  
        	// 1..15 bytes remain: generate one more block of key stream from the
        	// counter block in slot 0.
   856  	MOVOU (8*16 + 0*16)(SP), B0
   857  	AESENC B1, B0
   858  	AESENC B2, B0
   859  	AESENC B3, B0
   860  	AESENC B4, B0
   861  	AESENC B5, B0
   862  	AESENC B6, B0
   863  	AESENC B7, B0
   864  	MOVOU (16*8)(ks), T0
   865  	AESENC T0, B0
   866  	MOVOU (16*9)(ks), T0
   867  	AESENC T0, B0
   868  	MOVOU (16*10)(ks), T0
   869  	CMPQ NR, $12
   870  	JB encLast4
   871  	AESENC T0, B0
   872  	MOVOU (16*11)(ks), T0
   873  	AESENC T0, B0
   874  	MOVOU (16*12)(ks), T0
   875  	JE encLast4	// flags still come from the CMPQ above
   876  	AESENC T0, B0
   877  	MOVOU (16*13)(ks), T0
   878  	AESENC T0, B0
   879  	MOVOU (16*14)(ks), T0
   880  encLast4:
   881  	AESENCLAST T0, B0
   882  	MOVOU B0, T0	// T0 = key stream block
   883  
   884  	LEAQ -1(ptx)(ptxLen*1), ptx	// point at the last plaintext byte
   885  
   886  	MOVQ ptxLen, aluTMP
   887  	SHLQ $4, aluTMP
   888  
        	// aluCTR is reused as a pointer here; T1 = mask covering the first
        	// ptxLen bytes.
   889  	LEAQ andMask<>(SB), aluCTR
   890  	MOVOU -16(aluCTR)(aluTMP*1), T1
   891  
        	// Gather the remaining plaintext bytes back to front, without reading
        	// past the end of src.
   892  	PXOR B0, B0
   893  ptxLoadLoop:
   894  		PSLLDQ $1, B0
   895  		PINSRB $0, (ptx), B0
   896  		LEAQ -1(ptx), ptx
   897  		DECQ ptxLen
   898  	JNE ptxLoadLoop
   899  
        	// Encrypt, then clear the bytes beyond the message so that the hashed
        	// block is zero-padded. Note that a full 16 bytes are written to dst.
   900  	PXOR T0, B0
   901  	PAND T1, B0
   902  	MOVOU B0, (ctx)	// I assume there is always space, due to TAG in the end of the CT
   903  
   904  	PSHUFB BSWAP, B0
   905  	PXOR ACC0, B0
   906  
   907  	MOVOU T2, ACC0
   908  	MOVOU T2, ACC1
   909  	MOVOU (16*15)(pTbl), ACCM
   910  
   911  	PSHUFD $78, B0, T0
   912  	PXOR B0, T0
   913  	PCLMULQDQ $0x00, B0, ACC0
   914  	PCLMULQDQ $0x11, B0, ACC1
   915  	PCLMULQDQ $0x00, T0, ACCM
   916  
   917  	PXOR ACC0, ACCM
   918  	PXOR ACC1, ACCM
   919  	MOVOU ACCM, T0
   920  	PSRLDQ $8, ACCM
   921  	PSLLDQ $8, T0
   922  	PXOR ACCM, ACC1
   923  	PXOR T0, ACC0
   924  
   925  	reduceRound(ACC0)
   926  	reduceRound(ACC0)
   927  	PXOR ACC1, ACC0
   928  
   929  gcmAesEncDone:
        	// Write the updated hash state back to T.
   930  	MOVOU ACC0, (tPtr)
   931  	RET
   932  #undef increment
   933  
   934  // func gcmAesDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
        // Decrypts src (ciphertext) into dst (plaintext) in counter mode and folds
        // the ciphertext into the hash state T (read at entry, written back at
        // exit). Since the ciphertext is the input, hashing and decryption of the
        // same blocks proceed together and no ciphertext needs to be parked on
        // the stack.
        // As in gcmAesEnc, the counter is incremented before its first use.
        //
        // Stack frame (128 bytes): eight counter blocks, already XORed with
        // round key 0.
        //
        // This function relies on the register aliases (pTbl, ctx, ptx, ...) and
        // on the aesRnd/aesRound/aesRndLast/reduceRound macros defined in
        // gcmAesEnc, which are still in effect here; only increment is redefined.
        // Note that in this function ctx points at the input and ptx at the
        // output.
   935  TEXT ·gcmAesDec(SB),0,$128-96
        // increment(i): same as in gcmAesEnc, but the counter slots start at
        // offset 0 of the frame.
   936  #define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + i*16)(SP)
        // combinedDecRound(i): AES round i on B0-B7, interleaved with the three
        // Karatsuba products of ciphertext block i (read from ctx and
        // byte-reversed) and table entry i, accumulated into ACC0/ACC1/ACCM.
        // Clobbers T0-T2.
   937  #define combinedDecRound(i) \
   938  	MOVOU (16*i)(ks), T0;\
   939  	AESENC T0, B0;\
   940  	AESENC T0, B1;\
   941  	AESENC T0, B2;\
   942  	AESENC T0, B3;\
   943  	MOVOU (16*(i*2))(pTbl), T1;\
   944  	MOVOU T1, T2;\
   945  	AESENC T0, B4;\
   946  	AESENC T0, B5;\
   947  	AESENC T0, B6;\
   948  	AESENC T0, B7;\
   949  	MOVOU (16*i)(ctx), T0;\
   950  	PSHUFB BSWAP, T0;\
   951  	PCLMULQDQ $0x00, T0, T1;\
   952  	PXOR T1, ACC0;\
   953  	PSHUFD $78, T0, T1;\
   954  	PCLMULQDQ $0x11, T0, T2;\
   955  	PXOR T1, T0;\
   956  	PXOR T2, ACC1;\
   957  	MOVOU (16*(i*2+1))(pTbl), T2;\
   958  	PCLMULQDQ $0x00, T2, T0;\
   959  	PXOR T0, ACCM
   960  
   961  	MOVQ productTable+0(FP), pTbl
   962  	MOVQ dst+8(FP), ptx
   963  	MOVQ src_base+32(FP), ctx
   964  	MOVQ src_len+40(FP), ptxLen
   965  	MOVQ ctr+56(FP), ctrPtr
   966  	MOVQ T+64(FP), tPtr
   967  	MOVQ ks_base+72(FP), ks
   968  	MOVQ ks_len+80(FP), NR
   969  
   970  	SHRQ $2, NR
   971  	DECQ NR
   972  
   973  	MOVOU bswapMask<>(SB), BSWAP
   974  	MOVOU gcmPoly<>(SB), POLY
   975  
   976  	MOVOU (tPtr), ACC0
   977  	PXOR ACC1, ACC1
   978  	PXOR ACCM, ACCM
        	// aluCTR = 32-bit counter, aluK = last word of round key 0, both in
        	// host byte order.
   979  	MOVOU (ctrPtr), B0
   980  	MOVL (3*4)(ctrPtr), aluCTR
   981  	MOVOU (ks), T0
   982  	MOVL (3*4)(ks), aluK
   983  	BSWAPL aluCTR
   984  	BSWAPL aluK
   985  
        	// T0 = counter block XOR round key 0, the template for every counter
        	// slot.
   986  	PXOR B0, T0
   987  	MOVOU T0, (0*16)(SP)
   988  	increment(0)
   989  
   990  	CMPQ ptxLen, $128
   991  	JB gcmAesDecSingles
   992  
        	// At least eight blocks: fill the remaining seven counter slots.
   993  	MOVOU T0, (1*16)(SP)
   994  	increment(1)
   995  	MOVOU T0, (2*16)(SP)
   996  	increment(2)
   997  	MOVOU T0, (3*16)(SP)
   998  	increment(3)
   999  	MOVOU T0, (4*16)(SP)
  1000  	increment(4)
  1001  	MOVOU T0, (5*16)(SP)
  1002  	increment(5)
  1003  	MOVOU T0, (6*16)(SP)
  1004  	increment(6)
  1005  	MOVOU T0, (7*16)(SP)
  1006  	increment(7)
  1007  
  1008  gcmAesDecOctetsLoop:
  1009  
  1010  		CMPQ ptxLen, $128
  1011  		JB gcmAesDecEndOctets
  1012  		SUBQ $128, ptxLen
  1013  
  1014  		MOVOU (0*16)(SP), B0
  1015  		MOVOU (1*16)(SP), B1
  1016  		MOVOU (2*16)(SP), B2
  1017  		MOVOU (3*16)(SP), B3
  1018  		MOVOU (4*16)(SP), B4
  1019  		MOVOU (5*16)(SP), B5
  1020  		MOVOU (6*16)(SP), B6
  1021  		MOVOU (7*16)(SP), B7
  1022  
        		// Hash ciphertext block 0: fold in the running hash and multiply by
        		// table entry 0.
  1023  		MOVOU (16*0)(ctx), T0
  1024  		PSHUFB BSWAP, T0
  1025  		PXOR ACC0, T0
  1026  		PSHUFD $78, T0, T1
  1027  		PXOR T0, T1
  1028  
  1029  		MOVOU (16*0)(pTbl), ACC0
  1030  		MOVOU (16*1)(pTbl), ACCM
  1031  		MOVOU ACC0, ACC1
  1032  
  1033  		PCLMULQDQ $0x00, T1, ACCM
  1034  		PCLMULQDQ $0x00, T0, ACC0
  1035  		PCLMULQDQ $0x11, T0, ACC1
  1036  
        		// AES rounds 1-7, each paired with the multiplication of one
        		// ciphertext block and with refilling one counter slot for the next
        		// iteration.
  1037  		combinedDecRound(1)
  1038  		increment(0)
  1039  		combinedDecRound(2)
  1040  		increment(1)
  1041  		combinedDecRound(3)
  1042  		increment(2)
  1043  		combinedDecRound(4)
  1044  		increment(3)
  1045  		combinedDecRound(5)
  1046  		increment(4)
  1047  		combinedDecRound(6)
  1048  		increment(5)
  1049  		combinedDecRound(7)
  1050  		increment(6)
  1051  
  1052  		aesRound(8)
  1053  		increment(7)
  1054  
        		// Recover the Karatsuba middle term and fold it into ACC1:ACC0.
  1055  		PXOR ACC0, ACCM
  1056  		PXOR ACC1, ACCM
  1057  		MOVOU ACCM, T0
  1058  		PSRLDQ $8, ACCM
  1059  		PSLLDQ $8, T0
  1060  		PXOR ACCM, ACC1
  1061  		PXOR T0, ACC0
  1062  
        		// The two reduction steps are interleaved with AES round 9.
  1063  		reduceRound(ACC0)
  1064  		aesRound(9)
  1065  
  1066  		reduceRound(ACC0)
  1067  		PXOR ACC1, ACC0
  1068  
  1069  		MOVOU (16*10)(ks), T0
  1070  		CMPQ NR, $12
  1071  		JB decLast1
  1072  		aesRnd(T0)
  1073  		aesRound(11)
  1074  		MOVOU (16*12)(ks), T0
  1075  		JE decLast1	// flags still come from the CMPQ above
  1076  		aesRnd(T0)
  1077  		aesRound(13)
  1078  		MOVOU (16*14)(ks), T0
  1079  decLast1:
  1080  		aesRndLast(T0)
  1081  
        		// XOR the key stream with the ciphertext to obtain the plaintext.
  1082  		MOVOU (16*0)(ctx), T0
  1083  		PXOR T0, B0
  1084  		MOVOU (16*1)(ctx), T0
  1085  		PXOR T0, B1
  1086  		MOVOU (16*2)(ctx), T0
  1087  		PXOR T0, B2
  1088  		MOVOU (16*3)(ctx), T0
  1089  		PXOR T0, B3
  1090  		MOVOU (16*4)(ctx), T0
  1091  		PXOR T0, B4
  1092  		MOVOU (16*5)(ctx), T0
  1093  		PXOR T0, B5
  1094  		MOVOU (16*6)(ctx), T0
  1095  		PXOR T0, B6
  1096  		MOVOU (16*7)(ctx), T0
  1097  		PXOR T0, B7
  1098  
  1099  		MOVOU B0, (16*0)(ptx)
  1100  		MOVOU B1, (16*1)(ptx)
  1101  		MOVOU B2, (16*2)(ptx)
  1102  		MOVOU B3, (16*3)(ptx)
  1103  		MOVOU B4, (16*4)(ptx)
  1104  		MOVOU B5, (16*5)(ptx)
  1105  		MOVOU B6, (16*6)(ptx)
  1106  		MOVOU B7, (16*7)(ptx)
  1107  
  1108  		LEAQ 128(ptx), ptx
  1109  		LEAQ 128(ctx), ctx
  1110  
  1111  		JMP gcmAesDecOctetsLoop
  1112  
  1113  gcmAesDecEndOctets:
  1114  
        	// Counter slots 0-7 hold the next eight counter values, but from here on
        	// only slot 0 is consumed. Rewind aluCTR by 7 so that the next
        	// increment(0) yields the value following the one stored in slot 0.
  1115  	SUBQ $7, aluCTR
  1116  
  1117  gcmAesDecSingles:
  1118  
        	// B1-B7 are reused to cache round keys 1-7, and T2 caches the table
        	// entry at 16*14 used for single-block hashing.
  1119  	MOVOU (16*1)(ks), B1
  1120  	MOVOU (16*2)(ks), B2
  1121  	MOVOU (16*3)(ks), B3
  1122  	MOVOU (16*4)(ks), B4
  1123  	MOVOU (16*5)(ks), B5
  1124  	MOVOU (16*6)(ks), B6
  1125  	MOVOU (16*7)(ks), B7
  1126  
  1127  	MOVOU (16*14)(pTbl), T2
  1128  
  1129  gcmAesDecSinglesLoop:
  1130  
  1131  		CMPQ ptxLen, $16
  1132  		JB gcmAesDecTail
  1133  		SUBQ $16, ptxLen
  1134  
        		// Hash the ciphertext block first. T1 keeps an untouched copy for
        		// the decryption XOR below, because B0 is reused for the key
        		// stream.
  1135  		MOVOU (ctx), B0
  1136  		MOVOU B0, T1
  1137  		PSHUFB BSWAP, B0
  1138  		PXOR ACC0, B0
  1139  
  1140  		MOVOU T2, ACC0
  1141  		MOVOU T2, ACC1
  1142  		MOVOU (16*15)(pTbl), ACCM
  1143  
  1144  		PCLMULQDQ $0x00, B0, ACC0
  1145  		PCLMULQDQ $0x11, B0, ACC1
  1146  		PSHUFD $78, B0, T0
  1147  		PXOR B0, T0
  1148  		PCLMULQDQ $0x00, T0, ACCM
  1149  
  1150  		PXOR ACC0, ACCM
  1151  		PXOR ACC1, ACCM
  1152  		MOVOU ACCM, T0
  1153  		PSRLDQ $8, ACCM
  1154  		PSLLDQ $8, T0
  1155  		PXOR ACCM, ACC1
  1156  		PXOR T0, ACC0
  1157  
  1158  		reduceRound(ACC0)
  1159  		reduceRound(ACC0)
  1160  		PXOR ACC1, ACC0
  1161  
        		// Generate the key stream from the counter block in slot 0 and
        		// prepare the next counter.
  1162  		MOVOU (0*16)(SP), B0
  1163  		increment(0)
  1164  		AESENC B1, B0
  1165  		AESENC B2, B0
  1166  		AESENC B3, B0
  1167  		AESENC B4, B0
  1168  		AESENC B5, B0
  1169  		AESENC B6, B0
  1170  		AESENC B7, B0
  1171  		MOVOU (16*8)(ks), T0
  1172  		AESENC T0, B0
  1173  		MOVOU (16*9)(ks), T0
  1174  		AESENC T0, B0
  1175  		MOVOU (16*10)(ks), T0
  1176  		CMPQ NR, $12
  1177  		JB decLast2
  1178  		AESENC T0, B0
  1179  		MOVOU (16*11)(ks), T0
  1180  		AESENC T0, B0
  1181  		MOVOU (16*12)(ks), T0
  1182  		JE decLast2	// flags still come from the CMPQ above
  1183  		AESENC T0, B0
  1184  		MOVOU (16*13)(ks), T0
  1185  		AESENC T0, B0
  1186  		MOVOU (16*14)(ks), T0
  1187  decLast2:
  1188  		AESENCLAST T0, B0
  1189  
  1190  		PXOR T1, B0	// key stream XOR saved ciphertext = plaintext
  1191  		MOVOU B0, (ptx)
  1192  
  1193  		LEAQ (16*1)(ptx), ptx
  1194  		LEAQ (16*1)(ctx), ctx
  1195  
  1196  	JMP gcmAesDecSinglesLoop
  1197  
  1198  gcmAesDecTail:
  1199  
  1200  	TESTQ ptxLen, ptxLen
  1201  	JE gcmAesDecDone
  1202  
        	// 1..15 bytes remain. aluCTR is reused as a pointer; T1 = mask covering
        	// the first ptxLen bytes.
  1203  	MOVQ ptxLen, aluTMP
  1204  	SHLQ $4, aluTMP
  1205  	LEAQ andMask<>(SB), aluCTR
  1206  	MOVOU -16(aluCTR)(aluTMP*1), T1
  1207  
        	// A full 16 bytes are read from src here; the bytes beyond ptxLen are
        	// masked off.
  1208  	MOVOU (ctx), B0	// I assume there is TAG attached to the ctx, and there is no read overflow
  1209  	PAND T1, B0
  1210  
  1211  	MOVOU B0, T1	// keep the masked ciphertext for the decryption XOR
  1212  	PSHUFB BSWAP, B0
  1213  	PXOR ACC0, B0
  1214  
  1215  	MOVOU (16*14)(pTbl), ACC0
  1216  	MOVOU (16*15)(pTbl), ACCM
  1217  	MOVOU ACC0, ACC1
  1218  
  1219  	PCLMULQDQ $0x00, B0, ACC0
  1220  	PCLMULQDQ $0x11, B0, ACC1
  1221  	PSHUFD $78, B0, T0
  1222  	PXOR B0, T0
  1223  	PCLMULQDQ $0x00, T0, ACCM
  1224  
  1225  	PXOR ACC0, ACCM
  1226  	PXOR ACC1, ACCM
  1227  	MOVOU ACCM, T0
  1228  	PSRLDQ $8, ACCM
  1229  	PSLLDQ $8, T0
  1230  	PXOR ACCM, ACC1
  1231  	PXOR T0, ACC0
  1232  
  1233  	reduceRound(ACC0)
  1234  	reduceRound(ACC0)
  1235  	PXOR ACC1, ACC0
  1236  
  1237  	MOVOU (0*16)(SP), B0
  1238  	increment(0)
  1239  	AESENC B1, B0
  1240  	AESENC B2, B0
  1241  	AESENC B3, B0
  1242  	AESENC B4, B0
  1243  	AESENC B5, B0
  1244  	AESENC B6, B0
  1245  	AESENC B7, B0
  1246  	MOVOU (16*8)(ks), T0
  1247  	AESENC T0, B0
  1248  	MOVOU (16*9)(ks), T0
  1249  	AESENC T0, B0
  1250  	MOVOU (16*10)(ks), T0
  1251  	CMPQ NR, $12
  1252  	JB decLast3
  1253  	AESENC T0, B0
  1254  	MOVOU (16*11)(ks), T0
  1255  	AESENC T0, B0
  1256  	MOVOU (16*12)(ks), T0
  1257  	JE decLast3	// flags still come from the CMPQ above
  1258  	AESENC T0, B0
  1259  	MOVOU (16*13)(ks), T0
  1260  	AESENC T0, B0
  1261  	MOVOU (16*14)(ks), T0
  1262  decLast3:
  1263  	AESENCLAST T0, B0
  1264  	PXOR T1, B0
  1265  
        	// Write exactly ptxLen plaintext bytes, one at a time, so nothing is
        	// stored past the end of the message.
  1266  ptxStoreLoop:
  1267  		PEXTRB $0, B0, (ptx)
  1268  		PSRLDQ $1, B0
  1269  		LEAQ 1(ptx), ptx
  1270  		DECQ ptxLen
  1271  
  1272  	JNE ptxStoreLoop
  1273  
  1274  gcmAesDecDone:
  1275  
        	// Write the updated hash state back to T.
  1276  	MOVOU ACC0, (tPtr)
  1277  	RET