github.com/mad-day/Yawning-crypto@v0.0.0-20190711051033-5a5f8cca32ec/aez/aez_amd64.py

#!/usr/bin/env python3
#
# To the extent possible under law, Yawning Angel has waived all copyright
# and related or neighboring rights to aez, using the Creative
# Commons "CC0" public domain dedication. See LICENSE or
# <http://creativecommons.org/publicdomain/zero/1.0/> for full details.

#
# Dependencies: https://github.com/Maratyszcza/PeachPy
#
# python3 -m peachpy.x86_64 -mabi=goasm -S -o aez_amd64.s aez_amd64.py
#

from peachpy import *
from peachpy.x86_64 import *

cpuidParams = Argument(ptr(uint32_t))

with Function("cpuidAMD64", (cpuidParams,)):
    reg_params = registers.r15
    LOAD.ARGUMENT(reg_params, cpuidParams)

    MOV(registers.eax, [reg_params])
    MOV(registers.ecx, [reg_params+8])

    CPUID()

    MOV([reg_params], registers.eax)
    MOV([reg_params+4], registers.ebx)
    MOV([reg_params+8], registers.ecx)
    MOV([reg_params+12], registers.edx)

    RETURN()

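# The cpuidParams block is four consecutive uint32 words: on entry word 0
# holds the leaf (EAX) and word 2 the sub-leaf (ECX); on return words 0..3
# hold EAX, EBX, ECX, EDX.  For reference, AES-NI support is CPUID leaf 1,
# ECX bit 25 (SSE4.1 is ECX bit 19); the Go caller presumably tests those
# bits before selecting this implementation.
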
with Function("resetAMD64SSE2", ()):
    PXOR(registers.xmm0, registers.xmm0)
    PXOR(registers.xmm1, registers.xmm1)
    PXOR(registers.xmm2, registers.xmm2)
    PXOR(registers.xmm3, registers.xmm3)
    PXOR(registers.xmm4, registers.xmm4)
    PXOR(registers.xmm5, registers.xmm5)
    PXOR(registers.xmm6, registers.xmm6)
    PXOR(registers.xmm7, registers.xmm7)
    PXOR(registers.xmm8, registers.xmm8)
    PXOR(registers.xmm9, registers.xmm9)
    PXOR(registers.xmm10, registers.xmm10)
    PXOR(registers.xmm11, registers.xmm11)
    PXOR(registers.xmm12, registers.xmm12)
    PXOR(registers.xmm13, registers.xmm13)
    PXOR(registers.xmm14, registers.xmm14)
    PXOR(registers.xmm15, registers.xmm15)
    RETURN()

a = Argument(ptr(const_uint8_t))
b = Argument(ptr(const_uint8_t))
c = Argument(ptr(const_uint8_t))
d = Argument(ptr(const_uint8_t))
dst = Argument(ptr(uint8_t))

with Function("xorBytes1x16AMD64SSE2", (a, b, dst)):
    reg_a = GeneralPurposeRegister64()
    reg_b = GeneralPurposeRegister64()
    reg_dst = GeneralPurposeRegister64()

    LOAD.ARGUMENT(reg_a, a)
    LOAD.ARGUMENT(reg_b, b)
    LOAD.ARGUMENT(reg_dst, dst)

    xmm_a = XMMRegister()
    xmm_b = XMMRegister()

    MOVDQU(xmm_a, [reg_a])
    MOVDQU(xmm_b, [reg_b])

    PXOR(xmm_a, xmm_b)

    MOVDQU([reg_dst], xmm_a)

    RETURN()

with Function("xorBytes4x16AMD64SSE2", (a, b, c, d, dst)):
    reg_a = GeneralPurposeRegister64()
    reg_b = GeneralPurposeRegister64()
    reg_c = GeneralPurposeRegister64()
    reg_d = GeneralPurposeRegister64()
    reg_dst = GeneralPurposeRegister64()

    LOAD.ARGUMENT(reg_a, a)
    LOAD.ARGUMENT(reg_b, b)
    LOAD.ARGUMENT(reg_c, c)
    LOAD.ARGUMENT(reg_d, d)
    LOAD.ARGUMENT(reg_dst, dst)

    xmm_a = XMMRegister()
    xmm_b = XMMRegister()
    xmm_c = XMMRegister()
    xmm_d = XMMRegister()

    MOVDQU(xmm_a, [reg_a])
    MOVDQU(xmm_b, [reg_b])
    MOVDQU(xmm_c, [reg_c])
    MOVDQU(xmm_d, [reg_d])

    PXOR(xmm_a, xmm_b)
    PXOR(xmm_c, xmm_d)
    PXOR(xmm_a, xmm_c)

    MOVDQU([reg_dst], xmm_a)

    RETURN()

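# Pure-Python sketch of what the two helpers above compute (reference only;
# the real callers are the Go wrappers around the generated assembly):
def _xor_bytes_1x16_ref(a, b):
    return bytes(x ^ y for x, y in zip(a[:16], b[:16]))

def _xor_bytes_4x16_ref(a, b, c, d):
    return bytes(p ^ q ^ r ^ s
                 for p, q, r, s in zip(a[:16], b[:16], c[:16], d[:16]))
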
#
#  AES-NI helper functions.
#
def aesenc4x1(o, j, i, l, z):
    AESENC(o, j)
    AESENC(o, i)
    AESENC(o, l)
    AESENC(o, z)

def aesenc4x2(o0, o1, j, i, l, z):
    AESENC(o0, j)
    AESENC(o1, j)
    AESENC(o0, i)
    AESENC(o1, i)
    AESENC(o0, l)
    AESENC(o1, l)
    AESENC(o0, z)
    AESENC(o1, z)

def aesenc4x4(o0, o1, o2, o3, j, i, l, z):
    AESENC(o0, j)
    AESENC(o1, j)
    AESENC(o2, j)
    AESENC(o3, j)
    AESENC(o0, i)
    AESENC(o1, i)
    AESENC(o2, i)
    AESENC(o3, i)
    AESENC(o0, l)
    AESENC(o1, l)
    AESENC(o2, l)
    AESENC(o3, l)
    AESENC(o0, z)
    AESENC(o1, z)
    AESENC(o2, z)
    AESENC(o3, z)

def aesenc4x8(o0, o1, o2, o3, o4, o5, o6, o7, j, i, l, z):
    AESENC(o0, j)
    AESENC(o1, j)
    AESENC(o2, j)
    AESENC(o3, j)
    AESENC(o4, j)
    AESENC(o5, j)
    AESENC(o6, j)
    AESENC(o7, j)
    AESENC(o0, i)
    AESENC(o1, i)
    AESENC(o2, i)
    AESENC(o3, i)
    AESENC(o4, i)
    AESENC(o5, i)
    AESENC(o6, i)
    AESENC(o7, i)
    AESENC(o0, l)
    AESENC(o1, l)
    AESENC(o2, l)
    AESENC(o3, l)
    AESENC(o4, l)
    AESENC(o5, l)
    AESENC(o6, l)
    AESENC(o7, l)
    AESENC(o0, z)
    AESENC(o1, z)
    AESENC(o2, z)
    AESENC(o3, z)
    AESENC(o4, z)
    AESENC(o5, z)
    AESENC(o6, z)
    AESENC(o7, z)

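# For orientation: four AESENC rounds with an all-zero final round key is
# AEZ's AES4 primitive.  In pseudocode (aes_enc_round standing in for one
# full AES round):
#
#   aes4(x, j, i, l) = aes_enc_round(aes_enc_round(aes_enc_round(
#                          aes_enc_round(x, j), i), l), 0)
#
# The 2x/4x/8x variants interleave independent states to hide AESENC latency.
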
#
# Sigh.  PeachPy has "interesting" ideas of definitions for certain things,
# so just use the `zen` uarch, because it supports everything.
#

j = Argument(ptr(const_uint8_t))
i = Argument(ptr(const_uint8_t))
l = Argument(ptr(const_uint8_t))
k = Argument(ptr(const_uint8_t))
src = Argument(ptr(uint8_t))

with Function("aezAES4AMD64AESNI", (j, i, l, k, src, dst), target=uarch.zen):
    reg_j = GeneralPurposeRegister64()
    reg_i = GeneralPurposeRegister64()
    reg_l = GeneralPurposeRegister64()
    reg_k = GeneralPurposeRegister64()
    reg_src = GeneralPurposeRegister64()
    reg_dst = GeneralPurposeRegister64()

    LOAD.ARGUMENT(reg_j, j)
    LOAD.ARGUMENT(reg_i, i)
    LOAD.ARGUMENT(reg_l, l)
    LOAD.ARGUMENT(reg_k, k)
    LOAD.ARGUMENT(reg_src, src)
    LOAD.ARGUMENT(reg_dst, dst)

    xmm_state = XMMRegister()
    xmm_j = XMMRegister()
    xmm_i = XMMRegister()
    xmm_l = XMMRegister()
    xmm_zero = XMMRegister()

    MOVDQU(xmm_state, [reg_src])
    MOVDQA(xmm_j, [reg_j])
    MOVDQA(xmm_i, [reg_i])
    MOVDQA(xmm_l, [reg_l])

    PXOR(xmm_state, xmm_j)
    PXOR(xmm_i, xmm_l)
    PXOR(xmm_state, xmm_i)
    PXOR(xmm_zero, xmm_zero)

    MOVDQA(xmm_i, [reg_k])
    MOVDQA(xmm_j, [reg_k+16])
    MOVDQA(xmm_l, [reg_k+32])

    aesenc4x1(xmm_state, xmm_j, xmm_i, xmm_l, xmm_zero)

    MOVDQU([reg_dst], xmm_state)

    RETURN()

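# nb: k points at the extracted key I || J || L (16 bytes each), which is
# why I is loaded from k, J from k+16, and L from k+32; the round-key order
# fed to aesenc4x1 is then J, I, L, 0, matching AES4 above.
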
with Function("aezAES10AMD64AESNI", (l, k, src, dst), target=uarch.zen):
    reg_l = GeneralPurposeRegister64()
    reg_k = GeneralPurposeRegister64()
    reg_src = GeneralPurposeRegister64()
    reg_dst = GeneralPurposeRegister64()

    LOAD.ARGUMENT(reg_l, l)
    LOAD.ARGUMENT(reg_k, k)
    LOAD.ARGUMENT(reg_src, src)
    LOAD.ARGUMENT(reg_dst, dst)

    xmm_state = XMMRegister()
    xmm_i = XMMRegister()
    xmm_j = XMMRegister()
    xmm_l = XMMRegister()

    MOVDQU(xmm_state, [reg_src])
    MOVDQU(xmm_l, [reg_l])

    PXOR(xmm_state, xmm_l)

    MOVDQA(xmm_i, [reg_k])
    MOVDQA(xmm_j, [reg_k+16])
    MOVDQA(xmm_l, [reg_k+32])

    AESENC(xmm_state, xmm_i)
    AESENC(xmm_state, xmm_j)
    AESENC(xmm_state, xmm_l)
    AESENC(xmm_state, xmm_i)
    AESENC(xmm_state, xmm_j)
    AESENC(xmm_state, xmm_l)
    AESENC(xmm_state, xmm_i)
    AESENC(xmm_state, xmm_j)
    AESENC(xmm_state, xmm_l)
    AESENC(xmm_state, xmm_i)

    MOVDQU([reg_dst], xmm_state)

    RETURN()

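# nb: AES10 whitens with the caller-supplied l, then runs ten rounds keyed
# with the cycle I, J, L, I, J, L, I, J, L, I; unlike AES4 there is no
# trailing zero-key round.
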
def doubleBlock(blk, tmp0, tmp1, c):
    MOVDQA(tmp0, [c])        # Byte-order shuffle mask from the consts table.
    PSHUFB(blk, tmp0)        # Into machine order for the dword shifts.
    MOVDQA(tmp1, blk)
    PSRAD(tmp1, 31)          # Per-dword masks of the shifted-out top bits.
    PAND(tmp1, [c+16])       # Select the per-dword carry-in constants.
    PSHUFD(tmp1, tmp1, 0x93) # Rotate each carry into the neighboring dword.
    PSLLD(blk, 1)            # Shift each dword left; carries re-added below.
    PXOR(blk, tmp1)
    PSHUFB(blk, tmp0)        # Back to block order.

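# Pure-Python sketch of what doubleBlock computes (reference only):
# multiplication by x in GF(2^128) on a big-endian 16-byte block, with
# AEZ's reduction feedback constant 0x87.
def _double_block_ref(blk):
    v = int.from_bytes(blk, "big")
    carry = v >> 127
    return (((v << 1) & ((1 << 128) - 1)) ^ (0x87 * carry)).to_bytes(16, "big")
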
x = Argument(ptr(uint8_t))
consts = Argument(ptr(const_uint8_t))
sz = Argument(size_t)

with Function("aezCorePass1AMD64AESNI", (src, dst, x, i, l, k, consts, sz), target=uarch.zen):
    # This would be better as a port of the aesni pass_one() routine,
    # however that requires storing some intermediaries in reversed
    # form.

    reg_src = GeneralPurposeRegister64()
    reg_dst = GeneralPurposeRegister64()
    reg_x = GeneralPurposeRegister64()
    reg_tmp = GeneralPurposeRegister64()
    reg_l = GeneralPurposeRegister64()
    reg_bytes = GeneralPurposeRegister64()
    reg_idx = GeneralPurposeRegister64()

    LOAD.ARGUMENT(reg_src, src)  # src pointer
    LOAD.ARGUMENT(reg_dst, dst)  # dst pointer
    LOAD.ARGUMENT(reg_x, x)
    LOAD.ARGUMENT(reg_l, l)      # e.L[]
    LOAD.ARGUMENT(reg_bytes, sz) # bytes remaining
    MOV(reg_idx, 1)              # Index into e.L[]

    xmm_j = XMMRegister()    # AESENC round key J
    xmm_i = XMMRegister()    # AESENC round key I
    xmm_l = XMMRegister()    # AESENC round key L
    xmm_x = XMMRegister()    # Checksum X
    xmm_iDbl = XMMRegister() # e.I[1]
    xmm_tmp0 = XMMRegister()
    xmm_tmp1 = XMMRegister()
    xmm_zero = XMMRegister() # [16]byte{0x00}

    xmm_o0 = XMMRegister()
    xmm_o1 = XMMRegister()
    xmm_o2 = XMMRegister()
    xmm_o3 = XMMRegister()
    xmm_o4 = XMMRegister()
    xmm_o5 = XMMRegister()
    xmm_o6 = XMMRegister()
    xmm_o7 = XMMRegister()

    MOVDQU(xmm_x, [reg_x])

    LOAD.ARGUMENT(reg_tmp, i)
    MOVDQU(xmm_iDbl, [reg_tmp])

    LOAD.ARGUMENT(reg_tmp, k)
    MOVDQU(xmm_i, [reg_tmp])
    MOVDQU(xmm_j, [reg_tmp+16])
    MOVDQU(xmm_l, [reg_tmp+32])

    LOAD.ARGUMENT(reg_tmp, consts) # doubleBlock constants

    PXOR(xmm_zero, xmm_zero)

    # Process 16 * 16 bytes at a time in a loop.
    vector_loop256 = Loop()
    SUB(reg_bytes, 256)
    JB(vector_loop256.end)
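    # Loop control: the SUB/JB pair above skips the loop when fewer than 256
    # bytes remain, the SUB/JAE at the bottom re-enters it while a full 256
    # bytes are left, and the ADD after the loop restores the byte count for
    # the smaller 128/64/32 byte tiers below, which reuse the same pattern.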
    with vector_loop256:
        # TODO: Make better use of registers, optimize scheduling.

        # o0 = aes4(o0 ^ J ^ I ^ L[1], keys) // E(1,1)
        # o1 = aes4(o1 ^ J ^ I ^ L[2], keys) // E(1,2)
        # o2 = aes4(o2 ^ J ^ I ^ L[3], keys) // E(1,3)
        # o3 = aes4(o3 ^ J ^ I ^ L[4], keys) // E(1,4)
        # o4 = aes4(o4 ^ J ^ I ^ L[5], keys) // E(1,5)
        # o5 = aes4(o5 ^ J ^ I ^ L[6], keys) // E(1,6)
        # o6 = aes4(o6 ^ J ^ I ^ L[7], keys) // E(1,7)
        # o7 = aes4(o7 ^ J ^ I ^ L[0], keys) // E(1,0)
        MOVDQU(xmm_o0, [reg_src+16])
        MOVDQU(xmm_o1, [reg_src+48])
        MOVDQU(xmm_o2, [reg_src+80])
        MOVDQU(xmm_o3, [reg_src+112])
        MOVDQU(xmm_o4, [reg_src+144])
        MOVDQU(xmm_o5, [reg_src+176])
        MOVDQU(xmm_o6, [reg_src+208])
        MOVDQU(xmm_o7, [reg_src+240])
        MOVDQA(xmm_tmp0, xmm_j) # tmp = j ^ iDbl
        PXOR(xmm_tmp0, xmm_iDbl)
        PXOR(xmm_o0, xmm_tmp0)
        PXOR(xmm_o1, xmm_tmp0)
        PXOR(xmm_o2, xmm_tmp0)
        PXOR(xmm_o3, xmm_tmp0)
        PXOR(xmm_o4, xmm_tmp0)
        PXOR(xmm_o5, xmm_tmp0)
        PXOR(xmm_o6, xmm_tmp0)
        PXOR(xmm_o7, xmm_tmp0)
        PXOR(xmm_o0, [reg_l+16])  # L[1]
        PXOR(xmm_o1, [reg_l+32])  # L[2]
        PXOR(xmm_o2, [reg_l+48])  # L[3]
        PXOR(xmm_o3, [reg_l+64])  # L[4]
        PXOR(xmm_o4, [reg_l+80])  # L[5]
        PXOR(xmm_o5, [reg_l+96])  # L[6]
        PXOR(xmm_o6, [reg_l+112]) # L[7]
        PXOR(xmm_o7, [reg_l])     # L[0]
        aesenc4x8(xmm_o0, xmm_o1, xmm_o2, xmm_o3, xmm_o4, xmm_o5, xmm_o6, xmm_o7, xmm_j, xmm_i, xmm_l, xmm_zero)

        # dst[   :] = in[   :] ^ o0
        # dst[ 32:] = in[ 32:] ^ o1
        # dst[ 64:] = in[ 64:] ^ o2
        # dst[ 96:] = in[ 96:] ^ o3
        # dst[128:] = in[128:] ^ o4
        # dst[160:] = in[160:] ^ o5
        # dst[192:] = in[192:] ^ o6
        # dst[224:] = in[224:] ^ o7
        MOVDQU(xmm_tmp0, [reg_src])
        MOVDQU(xmm_tmp1, [reg_src+32])
        PXOR(xmm_o0, xmm_tmp0)
        PXOR(xmm_o1, xmm_tmp1)
        MOVDQU(xmm_tmp0, [reg_src+64])
        MOVDQU(xmm_tmp1, [reg_src+96])
        PXOR(xmm_o2, xmm_tmp0)
        PXOR(xmm_o3, xmm_tmp1)
        MOVDQU(xmm_tmp0, [reg_src+128])
        MOVDQU(xmm_tmp1, [reg_src+160])
        PXOR(xmm_o4, xmm_tmp0)
        PXOR(xmm_o5, xmm_tmp1)
        MOVDQU(xmm_tmp0, [reg_src+192])
        MOVDQU(xmm_tmp1, [reg_src+224])
        PXOR(xmm_o6, xmm_tmp0)
        PXOR(xmm_o7, xmm_tmp1)
        MOVDQU([reg_dst], xmm_o0)
        MOVDQU([reg_dst+32], xmm_o1)
        MOVDQU([reg_dst+64], xmm_o2)
        MOVDQU([reg_dst+96], xmm_o3)
        MOVDQU([reg_dst+128], xmm_o4)
        MOVDQU([reg_dst+160], xmm_o5)
        MOVDQU([reg_dst+192], xmm_o6)
        MOVDQU([reg_dst+224], xmm_o7)

        # o0 = aes4(o0 ^ I, keys) // E(0,0)
        # o1 = aes4(o1 ^ I, keys) // E(0,0)
        # o2 = aes4(o2 ^ I, keys) // E(0,0)
        # o3 = aes4(o3 ^ I, keys) // E(0,0)
        # o4 = aes4(o4 ^ I, keys) // E(0,0)
        # o5 = aes4(o5 ^ I, keys) // E(0,0)
        # o6 = aes4(o6 ^ I, keys) // E(0,0)
        # o7 = aes4(o7 ^ I, keys) // E(0,0)
        PXOR(xmm_o0, xmm_i)
        PXOR(xmm_o1, xmm_i)
        PXOR(xmm_o2, xmm_i)
        PXOR(xmm_o3, xmm_i)
        PXOR(xmm_o4, xmm_i)
        PXOR(xmm_o5, xmm_i)
        PXOR(xmm_o6, xmm_i)
        PXOR(xmm_o7, xmm_i)
        aesenc4x8(xmm_o0, xmm_o1, xmm_o2, xmm_o3, xmm_o4, xmm_o5, xmm_o6, xmm_o7, xmm_j, xmm_i, xmm_l, xmm_zero)

        # dst[ 16:] = o0 ^ in[ 16:]
        # dst[ 48:] = o1 ^ in[ 48:]
        # dst[ 80:] = o2 ^ in[ 80:]
        # dst[112:] = o3 ^ in[112:]
        # dst[144:] = o4 ^ in[144:]
        # dst[176:] = o5 ^ in[176:]
        # dst[208:] = o6 ^ in[208:]
        # dst[240:] = o7 ^ in[240:]
        MOVDQU(xmm_tmp0, [reg_src+16])
        MOVDQU(xmm_tmp1, [reg_src+48])
        PXOR(xmm_o0, xmm_tmp0)
        PXOR(xmm_o1, xmm_tmp1)
        MOVDQU(xmm_tmp0, [reg_src+80])
        MOVDQU(xmm_tmp1, [reg_src+112])
        PXOR(xmm_o2, xmm_tmp0)
        PXOR(xmm_o3, xmm_tmp1)
        MOVDQU(xmm_tmp0, [reg_src+144])
        MOVDQU(xmm_tmp1, [reg_src+176])
        PXOR(xmm_o4, xmm_tmp0)
        PXOR(xmm_o5, xmm_tmp1)
        MOVDQU(xmm_tmp0, [reg_src+208])
        MOVDQU(xmm_tmp1, [reg_src+240])
        PXOR(xmm_o6, xmm_tmp0)
        PXOR(xmm_o7, xmm_tmp1)
        MOVDQU([reg_dst+16], xmm_o0)
        MOVDQU([reg_dst+48], xmm_o1)
        MOVDQU([reg_dst+80], xmm_o2)
        MOVDQU([reg_dst+112], xmm_o3)
        MOVDQU([reg_dst+144], xmm_o4)
        MOVDQU([reg_dst+176], xmm_o5)
        MOVDQU([reg_dst+208], xmm_o6)
        MOVDQU([reg_dst+240], xmm_o7)

        # X ^= o0 ^ o1 ^ ... ^ o7
        PXOR(xmm_x, xmm_o0)
        PXOR(xmm_x, xmm_o1)
        PXOR(xmm_x, xmm_o2)
        PXOR(xmm_x, xmm_o3)
        PXOR(xmm_x, xmm_o4)
        PXOR(xmm_x, xmm_o5)
        PXOR(xmm_x, xmm_o6)
        PXOR(xmm_x, xmm_o7)

        # doubleBlock(I)
        doubleBlock(xmm_iDbl, xmm_tmp0, xmm_tmp1, reg_tmp)

        # Update bookkeeping.
        ADD(reg_src, 256)
        ADD(reg_dst, 256)
        SUB(reg_bytes, 256)
        JAE(vector_loop256.begin)
    ADD(reg_bytes, 256)
    process_64bytes = Label()
    SUB(reg_bytes, 128)
    JB(process_64bytes)

    # Can I haz registers?
    xmm_src_l0 = xmm_tmp0
    xmm_src_l1 = xmm_tmp1
    xmm_src_r0 = xmm_o4 # Change these at your peril (tmp0 used in 8 * 16 path)
    xmm_src_r1 = xmm_o5
    xmm_src_r2 = xmm_o6
    xmm_src_r3 = xmm_o7

    #
    # Process 8 * 16 bytes.
    #

    # o0 = aes4(o0 ^ J ^ I ^ L[1], keys) // E(1,1)
    # o1 = aes4(o1 ^ J ^ I ^ L[2], keys) // E(1,2)
    # o2 = aes4(o2 ^ J ^ I ^ L[3], keys) // E(1,3)
    # o3 = aes4(o3 ^ J ^ I ^ L[4], keys) // E(1,4)
    MOVDQU(xmm_src_r0, [reg_src+16])
    MOVDQU(xmm_src_r1, [reg_src+48])
    MOVDQU(xmm_src_r2, [reg_src+80])
    MOVDQU(xmm_src_r3, [reg_src+112])
    MOVDQA(xmm_o0, xmm_src_r0)
    MOVDQA(xmm_o1, xmm_src_r1)
    MOVDQA(xmm_o2, xmm_src_r2)
    MOVDQA(xmm_o3, xmm_src_r3)
    MOVDQA(xmm_tmp0, xmm_j) # tmp0(src_l0) = j ^ iDbl
    PXOR(xmm_tmp0, xmm_iDbl)
    PXOR(xmm_o0, xmm_tmp0)
    PXOR(xmm_o1, xmm_tmp0)
    PXOR(xmm_o2, xmm_tmp0)
    PXOR(xmm_o3, xmm_tmp0)
    PXOR(xmm_o0, [reg_l+16]) # L[1]
    PXOR(xmm_o1, [reg_l+32]) # L[2]
    PXOR(xmm_o2, [reg_l+48]) # L[3]
    PXOR(xmm_o3, [reg_l+64]) # L[4]
    aesenc4x4(xmm_o0, xmm_o1, xmm_o2, xmm_o3, xmm_j, xmm_i, xmm_l, xmm_zero)

    # dst[  :] = in[  :] ^ o0
    # dst[32:] = in[32:] ^ o1
    # dst[64:] = in[64:] ^ o2
    # dst[96:] = in[96:] ^ o3
    MOVDQU(xmm_src_l0, [reg_src])
    MOVDQU(xmm_src_l1, [reg_src+32])
    PXOR(xmm_o0, xmm_src_l0)
    PXOR(xmm_o1, xmm_src_l1)
    MOVDQU(xmm_src_l0, [reg_src+64])
    MOVDQU(xmm_src_l1, [reg_src+96])
    PXOR(xmm_o2, xmm_src_l0)
    PXOR(xmm_o3, xmm_src_l1)
    MOVDQU([reg_dst], xmm_o0)
    MOVDQU([reg_dst+32], xmm_o1)
    MOVDQU([reg_dst+64], xmm_o2)
    MOVDQU([reg_dst+96], xmm_o3)

    # o0 = aes4(o0 ^ I, keys) // E(0,0)
    # o1 = aes4(o1 ^ I, keys) // E(0,0)
    # o2 = aes4(o2 ^ I, keys) // E(0,0)
    # o3 = aes4(o3 ^ I, keys) // E(0,0)
    PXOR(xmm_o0, xmm_i)
    PXOR(xmm_o1, xmm_i)
    PXOR(xmm_o2, xmm_i)
    PXOR(xmm_o3, xmm_i)
    aesenc4x4(xmm_o0, xmm_o1, xmm_o2, xmm_o3, xmm_j, xmm_i, xmm_l, xmm_zero)

    # dst[ 16:] = o0 ^ in[ 16:]
    # dst[ 48:] = o1 ^ in[ 48:]
    # dst[ 80:] = o2 ^ in[ 80:]
    # dst[112:] = o3 ^ in[112:]
    PXOR(xmm_o0, xmm_src_r0)
    PXOR(xmm_o1, xmm_src_r1)
    PXOR(xmm_o2, xmm_src_r2)
    PXOR(xmm_o3, xmm_src_r3)
    MOVDQU([reg_dst+16], xmm_o0)
    MOVDQU([reg_dst+48], xmm_o1)
    MOVDQU([reg_dst+80], xmm_o2)
    MOVDQU([reg_dst+112], xmm_o3)

    # X ^= o0 ^ o1 ^ o2 ^ o3
    PXOR(xmm_x, xmm_o0)
    PXOR(xmm_x, xmm_o1)
    PXOR(xmm_x, xmm_o2)
    PXOR(xmm_x, xmm_o3)

    # Update bookkeeping.
    ADD(reg_src, 128)
    ADD(reg_dst, 128)
    ADD(reg_idx, 4)
    SUB(reg_bytes, 128)

    LABEL(process_64bytes)
    ADD(reg_bytes, 128)
    process_32bytes = Label()
    SUB(reg_bytes, 64)
    JB(process_32bytes)

    #
    # Process 4 * 16 bytes.
    #

    reg_l_offset = reg_tmp
    MOV(reg_l_offset, reg_idx)
    SHL(reg_l_offset, 4)
    ADD(reg_l_offset, reg_l) # reg_l_offset = reg_l + reg_idx*16 (L[i%8])

    # o0 = aes4(o0 ^ J ^ I ^ L[(i+0)%8], keys) // E(1,i)
    # o1 = aes4(o1 ^ J ^ I ^ L[(i+1)%8], keys) // E(1,i+1)
    MOVDQU(xmm_src_r0, [reg_src+16])
    MOVDQU(xmm_src_r1, [reg_src+48])
    MOVDQA(xmm_o0, xmm_src_r0)
    MOVDQA(xmm_o1, xmm_src_r1)
    PXOR(xmm_o0, xmm_j)
    PXOR(xmm_o1, xmm_j)
    PXOR(xmm_o0, xmm_iDbl)
    PXOR(xmm_o1, xmm_iDbl)
    PXOR(xmm_o0, [reg_l_offset])    # L[i]
    PXOR(xmm_o1, [reg_l_offset+16]) # L[i+1]
    aesenc4x2(xmm_o0, xmm_o1, xmm_j, xmm_i, xmm_l, xmm_zero)

    # dst[:  ] = in[:  ] ^ o0
    # dst[32:] = in[32:] ^ o1
    MOVDQU(xmm_src_l0, [reg_src])
    MOVDQU(xmm_src_l1, [reg_src+32])
    PXOR(xmm_o0, xmm_src_l0)
    PXOR(xmm_o1, xmm_src_l1)
    MOVDQU([reg_dst], xmm_o0)
    MOVDQU([reg_dst+32], xmm_o1)

    # o0 = aes4(o0 ^ I, keys) // E(0,0)
    # o1 = aes4(o1 ^ I, keys) // E(0,0)
    PXOR(xmm_o0, xmm_i)
    PXOR(xmm_o1, xmm_i)
    aesenc4x2(xmm_o0, xmm_o1, xmm_j, xmm_i, xmm_l, xmm_zero)

    # dst[16:] = o0 ^ in[16:]
    # dst[48:] = o1 ^ in[48:]
    PXOR(xmm_o0, xmm_src_r0)
    PXOR(xmm_o1, xmm_src_r1)
    MOVDQU([reg_dst+16], xmm_o0)
    MOVDQU([reg_dst+48], xmm_o1)

    # X ^= o0 ^ o1
    PXOR(xmm_x, xmm_o0)
    PXOR(xmm_x, xmm_o1)

    # Update bookkeeping.
    ADD(reg_src, 64)
    ADD(reg_dst, 64)
    ADD(reg_idx, 2)
    SUB(reg_bytes, 64)

    LABEL(process_32bytes)
    ADD(reg_bytes, 64)
    out = Label()
    SUB(reg_bytes, 32)
    JB(out)

    #
    # Process 2 * 16 bytes
    #

    # Pick the final L from the table.  This is the only time
    # where wrapping needs to happen based on the index.
    AND(reg_idx, 7)
    SHL(reg_idx, 4)
    ADD(reg_l, reg_idx)       # reg_l += reg_idx (&L[i%8])

    # o0 = aes4(o0 ^ J ^ I ^ L[i%8], keys) // E(1,i)
    MOVDQU(xmm_src_r0, [reg_src+16])
    MOVDQA(xmm_o0, xmm_src_r0)
    PXOR(xmm_o0, xmm_j)
    PXOR(xmm_o0, xmm_iDbl)
    PXOR(xmm_o0, [reg_l])
    aesenc4x1(xmm_o0, xmm_j, xmm_i, xmm_l, xmm_zero)

    # dst[:] = in[:] ^ o0
    MOVDQU(xmm_src_l0, [reg_src])
    PXOR(xmm_o0, xmm_src_l0)
    MOVDQU([reg_dst], xmm_o0)

    # o0 = aes4(o0 ^ I, keys) // E(0,0)
    PXOR(xmm_o0, xmm_i)
    aesenc4x1(xmm_o0, xmm_j, xmm_i, xmm_l, xmm_zero)

    # dst[16:] = o0 ^ in[16:]
    PXOR(xmm_o0, xmm_src_r0)
    MOVDQU([reg_dst+16], xmm_o0)

    # X ^= o0
    PXOR(xmm_x, xmm_o0)

    LABEL(out)

    # Write back X.
    MOVDQU([reg_x], xmm_x)

    RETURN()

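# Pass 1, per 32-byte block pair (l, r) at index i, as implemented by the
# tiers above (I denotes the running doubled e.I[1] value, I[0] the first
# extracted key block):
#   t     = AES4(r ^ J ^ I ^ L[i%8]) // E(1,i)
#   dst_l = l ^ t
#   t     = AES4(dst_l ^ I[0])       // E(0,0)
#   dst_r = r ^ t
#   X    ^= dst_r
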
y = Argument(ptr(uint8_t))
s = Argument(ptr(const_uint8_t))

with Function("aezCorePass2AMD64AESNI", (dst, y, s, j, i, l, k, consts, sz), target=uarch.zen):
    reg_dst = GeneralPurposeRegister64()
    reg_y = GeneralPurposeRegister64()
    reg_s = GeneralPurposeRegister64()
    reg_j = GeneralPurposeRegister64()
    reg_l = GeneralPurposeRegister64()
    reg_tmp = GeneralPurposeRegister64()
    reg_bytes = GeneralPurposeRegister64()
    reg_idx = GeneralPurposeRegister64()
    reg_sp_save = GeneralPurposeRegister64()

    LOAD.ARGUMENT(reg_dst, dst)  # dst pointer
    LOAD.ARGUMENT(reg_y, y)
    LOAD.ARGUMENT(reg_j, j)
    LOAD.ARGUMENT(reg_l, l)
    LOAD.ARGUMENT(reg_bytes, sz) # bytes remaining
    MOV(reg_idx, 1)              # Index into e.L[]

    xmm_j = XMMRegister()    # AESENC round key J
    xmm_i = XMMRegister()    # AESENC round key I
    xmm_l = XMMRegister()    # AESENC round key L
    xmm_s = XMMRegister()    # S
    xmm_y = XMMRegister()    # Checksum Y
    xmm_iDbl = XMMRegister() # e.I[1]
    xmm_zero = XMMRegister() # [16]byte{0x00}
    xmm_tmp0 = XMMRegister()

    xmm_o0 = XMMRegister()
    xmm_o1 = XMMRegister()
    xmm_o2 = XMMRegister()
    xmm_o3 = XMMRegister()
    xmm_o4 = XMMRegister()
    xmm_o5 = XMMRegister()
    xmm_o6 = XMMRegister()
    xmm_o7 = XMMRegister()

    LOAD.ARGUMENT(reg_tmp, k)
    MOVDQU(xmm_i, [reg_tmp])
    MOVDQU(xmm_j, [reg_tmp+16])
    MOVDQU(xmm_l, [reg_tmp+32])

    MOVDQU(xmm_y, [reg_y])

    LOAD.ARGUMENT(reg_tmp, i)
    MOVDQU(xmm_iDbl, [reg_tmp])

    LOAD.ARGUMENT(reg_tmp, consts)

    PXOR(xmm_zero, xmm_zero)

    LOAD.ARGUMENT(reg_s, s)
    MOVDQU(xmm_s, [reg_s])
    PXOR(xmm_s, [reg_j+16]) # S ^= J[1] (once per call, in theory)

    # Save the stack pointer, align the stack down to 32 bytes, and allocate
    # 256 bytes of scratch space.  256 is a multiple of 32, so rsp stays
    # aligned and the MOVDQA scratch accesses below are safe.
    MOV(reg_sp_save, registers.rsp)
    AND(registers.rsp, 0xffffffffffffffe0)
    SUB(registers.rsp, 256)

    # Name strategic offsets.  Each l/r slot pair shadows the left and right
    # halves of the corresponding 32-byte dst[] block pair.
    mem_dst_l0 = [registers.rsp]
    mem_dst_r0 = [registers.rsp+16]
    mem_dst_l1 = [registers.rsp+32]
    mem_dst_r1 = [registers.rsp+48]
    mem_dst_l2 = [registers.rsp+64]
    mem_dst_r2 = [registers.rsp+80]
    mem_dst_l3 = [registers.rsp+96]
    mem_dst_r3 = [registers.rsp+112]
    mem_dst_l4 = [registers.rsp+128]
    mem_dst_r4 = [registers.rsp+144]
    mem_dst_l5 = [registers.rsp+160]
    mem_dst_r5 = [registers.rsp+176]
    mem_dst_l6 = [registers.rsp+192]
    mem_dst_r6 = [registers.rsp+208]
    mem_dst_l7 = [registers.rsp+224]
    mem_dst_r7 = [registers.rsp+240]

    #
    # Process 16 * 16 bytes at a time in a loop.
    #
    vector_loop256 = Loop()
    SUB(reg_bytes, 256)
    JB(vector_loop256.end)
    with vector_loop256:
        # o0 = aes4(J[1] ^ I ^ L[1] ^ S[:], keys) // E(1,1)
        # o1 = aes4(J[1] ^ I ^ L[2] ^ S[:], keys) // E(1,2)
        #  ...
        # o6 = aes4(J[1] ^ I ^ L[7] ^ S[:], keys) // E(1,7)
        # o7 = aes4(J[1] ^ I ^ L[0] ^ S[:], keys) // E(1,0)
        MOVDQA(xmm_o0, xmm_s)
        PXOR(xmm_o0, xmm_iDbl)    # o0 = s ^ I
        MOVDQA(xmm_o1, xmm_o0)    # o1 = o0
        MOVDQA(xmm_o2, xmm_o0)    # o2 = o0
        MOVDQA(xmm_o3, xmm_o0)    # o3 = o0
        MOVDQA(xmm_o4, xmm_o0)    # o4 = o0
        MOVDQA(xmm_o5, xmm_o0)    # o5 = o0
        MOVDQA(xmm_o6, xmm_o0)    # o6 = o0
        MOVDQA(xmm_o7, xmm_o0)    # o7 = o0
        PXOR(xmm_o0, [reg_l+16])  # o0 ^= L[1]
        PXOR(xmm_o1, [reg_l+32])  # o1 ^= L[2]
        PXOR(xmm_o2, [reg_l+48])  # o2 ^= L[3]
        PXOR(xmm_o3, [reg_l+64])  # o3 ^= L[4]
        PXOR(xmm_o4, [reg_l+80])  # o4 ^= L[5]
        PXOR(xmm_o5, [reg_l+96])  # o5 ^= L[6]
        PXOR(xmm_o6, [reg_l+112]) # o6 ^= L[7]
        PXOR(xmm_o7, [reg_l])     # o7 ^= L[0]
        aesenc4x8(xmm_o0, xmm_o1, xmm_o2, xmm_o3, xmm_o4, xmm_o5, xmm_o6, xmm_o7, xmm_j, xmm_i, xmm_l, xmm_zero)

        # TODO: Figure out how the fuck to remove some of these loads/stores.
        xmm_tmp1 = xmm_s # Use as scratch till the end of loop body.

        # dst_l0 ^= o0, ... dst_l7 ^= o7
        # Y ^= dst_l0 ^ ... ^ dst_l7
        MOVDQU(xmm_tmp0, [reg_dst])
        MOVDQU(xmm_tmp1, [reg_dst+32])
        PXOR(xmm_tmp0, xmm_o0)
        PXOR(xmm_tmp1, xmm_o1)
        PXOR(xmm_y, xmm_tmp0)
        PXOR(xmm_y, xmm_tmp1)
        MOVDQA(mem_dst_l0, xmm_tmp0)
        MOVDQA(mem_dst_l1, xmm_tmp1)

        MOVDQU(xmm_tmp0, [reg_dst+64])
        MOVDQU(xmm_tmp1, [reg_dst+96])
        PXOR(xmm_tmp0, xmm_o2)
        PXOR(xmm_tmp1, xmm_o3)
        PXOR(xmm_y, xmm_tmp0)
        PXOR(xmm_y, xmm_tmp1)
        MOVDQA(mem_dst_l2, xmm_tmp0)
        MOVDQA(mem_dst_l3, xmm_tmp1)

        MOVDQU(xmm_tmp0, [reg_dst+128])
        MOVDQU(xmm_tmp1, [reg_dst+160])
        PXOR(xmm_tmp0, xmm_o4)
        PXOR(xmm_tmp1, xmm_o5)
        PXOR(xmm_y, xmm_tmp0)
        PXOR(xmm_y, xmm_tmp1)
        MOVDQA(mem_dst_l4, xmm_tmp0)
        MOVDQA(mem_dst_l5, xmm_tmp1)

        MOVDQU(xmm_tmp0, [reg_dst+192])
        MOVDQU(xmm_tmp1, [reg_dst+224])
        PXOR(xmm_tmp0, xmm_o6)
        PXOR(xmm_tmp1, xmm_o7)
        PXOR(xmm_y, xmm_tmp0)
        PXOR(xmm_y, xmm_tmp1)
        MOVDQA(mem_dst_l6, xmm_tmp0)
        MOVDQA(mem_dst_l7, xmm_tmp1)

        # o0 ^= dst_r0, ... o7 ^= dst_r7
        # dst_r0 = o0, ... dst_r7 = o7
        MOVDQU(xmm_tmp0, [reg_dst+16])
        MOVDQU(xmm_tmp1, [reg_dst+48])
        PXOR(xmm_o0, xmm_tmp0)
        PXOR(xmm_o1, xmm_tmp1)
        MOVDQA(mem_dst_r0, xmm_o0)
        MOVDQA(mem_dst_r1, xmm_o1)

        MOVDQU(xmm_tmp0, [reg_dst+80])
        MOVDQU(xmm_tmp1, [reg_dst+112])
        PXOR(xmm_o2, xmm_tmp0)
        PXOR(xmm_o3, xmm_tmp1)
        MOVDQA(mem_dst_r2, xmm_o2)
        MOVDQA(mem_dst_r3, xmm_o3)

        MOVDQU(xmm_tmp0, [reg_dst+144])
        MOVDQU(xmm_tmp1, [reg_dst+176])
        PXOR(xmm_o4, xmm_tmp0)
        PXOR(xmm_o5, xmm_tmp1)
        MOVDQA(mem_dst_r4, xmm_o4)
        MOVDQA(mem_dst_r5, xmm_o5)

        MOVDQU(xmm_tmp0, [reg_dst+208])
        MOVDQU(xmm_tmp1, [reg_dst+240])
        PXOR(xmm_o6, xmm_tmp0)
        PXOR(xmm_o7, xmm_tmp1)
        MOVDQA(mem_dst_r6, xmm_o6)
        MOVDQA(mem_dst_r7, xmm_o7)

        # o0 = aes4(o0 ^ I[0]) // E(0,0)
        #  ...
        # o7 = aes4(o7 ^ I[0]) // E(0,0)
        PXOR(xmm_o0, xmm_i)
        PXOR(xmm_o1, xmm_i)
        PXOR(xmm_o2, xmm_i)
        PXOR(xmm_o3, xmm_i)
        PXOR(xmm_o4, xmm_i)
        PXOR(xmm_o5, xmm_i)
        PXOR(xmm_o6, xmm_i)
        PXOR(xmm_o7, xmm_i)
        aesenc4x8(xmm_o0, xmm_o1, xmm_o2, xmm_o3, xmm_o4, xmm_o5, xmm_o6, xmm_o7, xmm_j, xmm_i, xmm_l, xmm_zero)

        # o0 ^= dst_l0, ... o7 ^= dst_l7
        # dst_l0 = o0, ... dst_l7 = o7
        #
        # nb: Stored into the right hand blocks of dst[], because we are
        # done with the left hand side.
        PXOR(xmm_o0, mem_dst_l0)
        PXOR(xmm_o1, mem_dst_l1)
        PXOR(xmm_o2, mem_dst_l2)
        PXOR(xmm_o3, mem_dst_l3)
        PXOR(xmm_o4, mem_dst_l4)
        PXOR(xmm_o5, mem_dst_l5)
        PXOR(xmm_o6, mem_dst_l6)
        PXOR(xmm_o7, mem_dst_l7)
        MOVDQU([reg_dst+16], xmm_o0)
        MOVDQU([reg_dst+48], xmm_o1)
        MOVDQU([reg_dst+80], xmm_o2)
        MOVDQU([reg_dst+112], xmm_o3)
        MOVDQU([reg_dst+144], xmm_o4)
        MOVDQU([reg_dst+176], xmm_o5)
        MOVDQU([reg_dst+208], xmm_o6)
        MOVDQU([reg_dst+240], xmm_o7)

        # o0 = aes4(o0 ^ J[0] ^ I ^ L[1]) // E(1,1)
        # o1 = aes4(o1 ^ J[0] ^ I ^ L[2]) // E(1,2)
        #  ...
        # o6 = aes4(o6 ^ J[0] ^ I ^ L[7]) // E(1,7)
        # o7 = aes4(o7 ^ J[0] ^ I ^ L[0]) // E(1,0)
        MOVDQA(xmm_tmp0, [reg_j])
        PXOR(xmm_tmp0, xmm_iDbl)  # tmp = J[0] ^ I
        PXOR(xmm_o0, xmm_tmp0)    # o0 ^= tmp
        PXOR(xmm_o1, xmm_tmp0)    # o1 ^= tmp
        PXOR(xmm_o2, xmm_tmp0)    # o2 ^= tmp
        PXOR(xmm_o3, xmm_tmp0)    # o3 ^= tmp
        PXOR(xmm_o4, xmm_tmp0)    # o4 ^= tmp
        PXOR(xmm_o5, xmm_tmp0)    # o5 ^= tmp
        PXOR(xmm_o6, xmm_tmp0)    # o6 ^= tmp
        PXOR(xmm_o7, xmm_tmp0)    # o7 ^= tmp
        PXOR(xmm_o0, [reg_l+16])  # o0 ^= L[1]
        PXOR(xmm_o1, [reg_l+32])  # o1 ^= L[2]
        PXOR(xmm_o2, [reg_l+48])  # o2 ^= L[3]
        PXOR(xmm_o3, [reg_l+64])  # o3 ^= L[4]
        PXOR(xmm_o4, [reg_l+80])  # o4 ^= L[5]
        PXOR(xmm_o5, [reg_l+96])  # o5 ^= L[6]
        PXOR(xmm_o6, [reg_l+112]) # o6 ^= L[7]
        PXOR(xmm_o7, [reg_l])     # o7 ^= L[0]
        aesenc4x8(xmm_o0, xmm_o1, xmm_o2, xmm_o3, xmm_o4, xmm_o5, xmm_o6, xmm_o7, xmm_j, xmm_i, xmm_l, xmm_zero)

        # dst_r0 ^= o0, ... dst_r7 ^= o7
        # dst_l0, dst_r0 = dst_r0, dst_l0 ... dst_l7, dst_r7 = dst_r7, dst_l7
        #
        # nb: dst_l0 ... dst_l7 already written after the previous aesenc4x8
        # call.
        PXOR(xmm_o0, mem_dst_r0)
        PXOR(xmm_o1, mem_dst_r1)
        PXOR(xmm_o2, mem_dst_r2)
        PXOR(xmm_o3, mem_dst_r3)
        PXOR(xmm_o4, mem_dst_r4)
        PXOR(xmm_o5, mem_dst_r5)
        PXOR(xmm_o6, mem_dst_r6)
        PXOR(xmm_o7, mem_dst_r7)
        MOVDQU([reg_dst], xmm_o0)
        MOVDQU([reg_dst+32], xmm_o1)
        MOVDQU([reg_dst+64], xmm_o2)
        MOVDQU([reg_dst+96], xmm_o3)
        MOVDQU([reg_dst+128], xmm_o4)
        MOVDQU([reg_dst+160], xmm_o5)
        MOVDQU([reg_dst+192], xmm_o6)
        MOVDQU([reg_dst+224], xmm_o7)

        # doubleBlock(I)
        doubleBlock(xmm_iDbl, xmm_tmp0, xmm_tmp1, reg_tmp)

        MOVDQU(xmm_s, [reg_s])
        PXOR(xmm_s, [reg_j+16])  # Re-derive since it was used as scratch space.

        # Update bookkeeping.
        ADD(reg_dst, 256)
        SUB(reg_bytes, 256)
        JAE(vector_loop256.begin)

        # Purge the scratch space that we are done with.
        MOVDQA(mem_dst_r0, xmm_zero)
        MOVDQA(mem_dst_r1, xmm_zero)
        MOVDQA(mem_dst_r2, xmm_zero)
        MOVDQA(mem_dst_r3, xmm_zero)
        MOVDQA(mem_dst_l4, xmm_zero)
        MOVDQA(mem_dst_r4, xmm_zero)
        MOVDQA(mem_dst_l5, xmm_zero)
        MOVDQA(mem_dst_r5, xmm_zero)
        MOVDQA(mem_dst_l6, xmm_zero)
        MOVDQA(mem_dst_r6, xmm_zero)
        MOVDQA(mem_dst_l7, xmm_zero)
        MOVDQA(mem_dst_r7, xmm_zero)
    ADD(reg_bytes, 256)
    process_64bytes = Label()
    SUB(reg_bytes, 128)
    JB(process_64bytes)

    # Can I haz registers?
    xmm_dst_l0 = xmm_o4
    xmm_dst_r0 = xmm_o5
    xmm_dst_l1 = xmm_o6
    xmm_dst_r1 = xmm_o7

    #
    # Process 8 * 16 bytes.
    #

    # o0 = aes4(J[1] ^ I ^ L[1] ^ S[:], keys) // E(1,1)
    # o1 = aes4(J[1] ^ I ^ L[2] ^ S[:], keys) // E(1,2)
    # o2 = aes4(J[1] ^ I ^ L[3] ^ S[:], keys) // E(1,3)
    # o3 = aes4(J[1] ^ I ^ L[4] ^ S[:], keys) // E(1,4)
    MOVDQA(xmm_o0, xmm_s)
    PXOR(xmm_o0, xmm_iDbl)   # o0 = s ^ I
    MOVDQA(xmm_o1, xmm_o0)   # o1 = o0
    MOVDQA(xmm_o2, xmm_o0)   # o2 = o0
    MOVDQA(xmm_o3, xmm_o0)   # o3 = o0
    PXOR(xmm_o0, [reg_l+16]) # o0 ^= L[1]
    PXOR(xmm_o1, [reg_l+32]) # o1 ^= L[2]
    PXOR(xmm_o2, [reg_l+48]) # o2 ^= L[3]
    PXOR(xmm_o3, [reg_l+64]) # o3 ^= L[4]
    aesenc4x4(xmm_o0, xmm_o1, xmm_o2, xmm_o3, xmm_j, xmm_i, xmm_l, xmm_zero)

    # Load the left halves of dst into registers.
    xmm_dst_l2 = xmm_dst_r0
    xmm_dst_l3 = xmm_dst_r1
    MOVDQU(xmm_dst_l0, [reg_dst])    # dst_l0 = dst[:]
    MOVDQU(xmm_dst_l1, [reg_dst+32]) # dst_l1 = dst[32:]
    MOVDQU(xmm_dst_l2, [reg_dst+64]) # dst_l2 = dst[64:]
    MOVDQU(xmm_dst_l3, [reg_dst+96]) # dst_l3 = dst[96:]

    # dst_l0 ^= o0, ... dst_l3 ^= o3
    PXOR(xmm_dst_l0, xmm_o0)
    PXOR(xmm_dst_l1, xmm_o1)
    PXOR(xmm_dst_l2, xmm_o2)
    PXOR(xmm_dst_l3, xmm_o3)

    # Y ^= dst_l0 ^ ... ^ dst_l3
    PXOR(xmm_y, xmm_dst_l0)
    PXOR(xmm_y, xmm_dst_l1)
    PXOR(xmm_y, xmm_dst_l2)
    PXOR(xmm_y, xmm_dst_l3)

    # Store the altered left halves.
    MOVDQA(mem_dst_l0, xmm_dst_l0)
    MOVDQA(mem_dst_l1, xmm_dst_l1)
    MOVDQA(mem_dst_l2, xmm_dst_l2)
    MOVDQA(mem_dst_l3, xmm_dst_l3)

    # Load the right halves of dst into registers.
    xmm_dst_r2 = xmm_dst_l0
    xmm_dst_r3 = xmm_dst_l1
    MOVDQU(xmm_dst_r0, [reg_dst+16])  # dst_r0 = dst[ 16:]
    MOVDQU(xmm_dst_r1, [reg_dst+48])  # dst_r1 = dst[ 48:]
    MOVDQU(xmm_dst_r2, [reg_dst+80])  # dst_r2 = dst[ 80:]
    MOVDQU(xmm_dst_r3, [reg_dst+112]) # dst_r3 = dst[112:]

    # o0 ^= dst_r0, ... o3 ^= dst_r3
    # dst_r0 = o0, ... dst_r3 = o3
    PXOR(xmm_o0, xmm_dst_r0)
    PXOR(xmm_o1, xmm_dst_r1)
    PXOR(xmm_o2, xmm_dst_r2)
    PXOR(xmm_o3, xmm_dst_r3)
    MOVDQU([reg_dst+16], xmm_o0)
    MOVDQU([reg_dst+48], xmm_o1)
    MOVDQU([reg_dst+80], xmm_o2)
    MOVDQU([reg_dst+112], xmm_o3)
    MOVDQA(xmm_dst_r0, xmm_o0)
    MOVDQA(xmm_dst_r1, xmm_o1)
    MOVDQA(xmm_dst_r2, xmm_o2)
    MOVDQA(xmm_dst_r3, xmm_o3)

    # o0 = aes4(o0 ^ I[0]) // E(0,0)
    #  ...
    # o3 = aes4(o3 ^ I[0]) // E(0,0)
    PXOR(xmm_o0, xmm_i)
    PXOR(xmm_o1, xmm_i)
    PXOR(xmm_o2, xmm_i)
    PXOR(xmm_o3, xmm_i)
    aesenc4x4(xmm_o0, xmm_o1, xmm_o2, xmm_o3, xmm_j, xmm_i, xmm_l, xmm_zero)

    # o0 ^= dst_l0, ... o3 ^= dst_l3
    # dst_l0 = o0, ... dst_l3 = o3
    #
    # nb: Stored into the right hand blocks of dst[], because we are
    # done with the left hand side.
    PXOR(xmm_o0, mem_dst_l0)
    PXOR(xmm_o1, mem_dst_l1)
    PXOR(xmm_o2, mem_dst_l2)
    PXOR(xmm_o3, mem_dst_l3)
    MOVDQU([reg_dst+16], xmm_o0)
    MOVDQU([reg_dst+48], xmm_o1)
    MOVDQU([reg_dst+80], xmm_o2)
    MOVDQU([reg_dst+112], xmm_o3)

    # o0 = aes4(o0 ^ J[0] ^ I ^ L[1]) // E(1,1)
    # o1 = aes4(o1 ^ J[0] ^ I ^ L[2]) // E(1,2)
    # o2 = aes4(o2 ^ J[0] ^ I ^ L[3]) // E(1,3)
    # o3 = aes4(o3 ^ J[0] ^ I ^ L[4]) // E(1,4)
    PXOR(xmm_o0, [reg_j])
    PXOR(xmm_o1, [reg_j])
    PXOR(xmm_o2, [reg_j])
    PXOR(xmm_o3, [reg_j])
    PXOR(xmm_o0, xmm_iDbl)
    PXOR(xmm_o1, xmm_iDbl)
    PXOR(xmm_o2, xmm_iDbl)
    PXOR(xmm_o3, xmm_iDbl)
    PXOR(xmm_o0, [reg_l+16]) # o0 ^= L[1]
    PXOR(xmm_o1, [reg_l+32]) # o1 ^= L[2]
    PXOR(xmm_o2, [reg_l+48]) # o2 ^= L[3]
    PXOR(xmm_o3, [reg_l+64]) # o3 ^= L[4]
    aesenc4x4(xmm_o0, xmm_o1, xmm_o2, xmm_o3, xmm_j, xmm_i, xmm_l, xmm_zero)

    # dst_r0 ^= o0, ... dst_r3 ^= o3
    # dst_l0, dst_r0 = dst_r0, dst_l0 ... dst_l3, dst_r3 = dst_r3, dst_l3
    #
    # nb: dst_l0 ... dst_l3 already written after the previous aesenc4x4
    # call.
    PXOR(xmm_o0, xmm_dst_r0)
    PXOR(xmm_o1, xmm_dst_r1)
    PXOR(xmm_o2, xmm_dst_r2)
    PXOR(xmm_o3, xmm_dst_r3)
    MOVDQU([reg_dst], xmm_o0)
    MOVDQU([reg_dst+32], xmm_o1)
    MOVDQU([reg_dst+64], xmm_o2)
    MOVDQU([reg_dst+96], xmm_o3)

    # Update bookkeeping.
    ADD(reg_dst, 128)
    ADD(reg_idx, 4)
    SUB(reg_bytes, 128)

    LABEL(process_64bytes)
    ADD(reg_bytes, 128)
    process_32bytes = Label()
    SUB(reg_bytes, 64)
    JB(process_32bytes)

    #
    # Process 4 * 16 bytes.
    #
    # (Scratch space unused past this point, working set fits into registers.)
    #

    reg_l_offset = reg_tmp
    MOV(reg_l_offset, reg_idx)
    SHL(reg_l_offset, 4)
    ADD(reg_l_offset, reg_l) # reg_l_offset = reg_l + reg_idx*16 (L[i%8])

    # o0 = aes4(J[1] ^ I ^ L[(i+0)%8] ^ S[:], keys) // E(1,i)
    # o1 = aes4(J[1] ^ I ^ L[(i+1)%8] ^ S[:], keys) // E(1,i+1)
    MOVDQA(xmm_o0, xmm_s)
    PXOR(xmm_o0, xmm_iDbl)          # o0 = s ^ I
    MOVDQA(xmm_o1, xmm_o0)          # o1 = o0
    PXOR(xmm_o0, [reg_l_offset])    # o0 ^= L[i]
    PXOR(xmm_o1, [reg_l_offset+16]) # o1 ^= L[i+1]
    aesenc4x2(xmm_o0, xmm_o1, xmm_j, xmm_i, xmm_l, xmm_zero)

    # Load dst into registers.
    MOVDQU(xmm_dst_l0, [reg_dst])    # dst_l0 = dst[:]
    MOVDQU(xmm_dst_r0, [reg_dst+16]) # dst_r0 = dst[16:]
    MOVDQU(xmm_dst_l1, [reg_dst+32]) # dst_l1 = dst[32:]
    MOVDQU(xmm_dst_r1, [reg_dst+48]) # dst_r1 = dst[48:]

    # dst_l0 ^= o0, dst_l1 ^= o1
    PXOR(xmm_dst_l0, xmm_o0)
    PXOR(xmm_dst_l1, xmm_o1)

    # Y ^= dst_l0 ^ dst_l1
    PXOR(xmm_y, xmm_dst_l0)
    PXOR(xmm_y, xmm_dst_l1)

    # o0 ^= dst_r0, o1 ^= dst_r1
    # dst_r0 = o0, dst_r1 = o1
    PXOR(xmm_o0, xmm_dst_r0)
    PXOR(xmm_o1, xmm_dst_r1)
    MOVDQA(xmm_dst_r0, xmm_o0)
    MOVDQA(xmm_dst_r1, xmm_o1)

    # o0 = aes4(o0 ^ I[0]) // E(0,0)
    # o1 = aes4(o1 ^ I[0]) // E(0,0)
    PXOR(xmm_o0, xmm_i)
    PXOR(xmm_o1, xmm_i)
    aesenc4x2(xmm_o0, xmm_o1, xmm_j, xmm_i, xmm_l, xmm_zero)

    # o0 ^= dst_l0, o1 ^= dst_l1
    # dst_l0 = o0, dst_l1 = o1
    PXOR(xmm_o0, xmm_dst_l0)
    PXOR(xmm_o1, xmm_dst_l1)
    MOVDQA(xmm_dst_l0, xmm_o0)
    MOVDQA(xmm_dst_l1, xmm_o1)

    # o0 = aes4(o0 ^ J[0] ^ I ^ L[(i+0)%8]) // E(1,i)
    # o1 = aes4(o1 ^ J[0] ^ I ^ L[(i+1)%8]) // E(1,i+1)
    PXOR(xmm_o0, [reg_j])
    PXOR(xmm_o1, [reg_j])
    PXOR(xmm_o0, xmm_iDbl)
    PXOR(xmm_o1, xmm_iDbl)
    PXOR(xmm_o0, [reg_tmp])    # L[i]
    PXOR(xmm_o1, [reg_tmp+16]) # L[i+1]
    aesenc4x2(xmm_o0, xmm_o1, xmm_j, xmm_i, xmm_l, xmm_zero)

    # dst_r0 ^= o0
    # dst_r1 ^= o1
    PXOR(xmm_dst_r0, xmm_o0)
    PXOR(xmm_dst_r1, xmm_o1)

    # dst_l0, dst_r0 = dst_r0, dst_l0 .. dst_l1, dst_r1 = dst_r1, dst_l1
    MOVDQU([reg_dst], xmm_dst_r0)
    MOVDQU([reg_dst+16], xmm_dst_l0)
    MOVDQU([reg_dst+32], xmm_dst_r1)
    MOVDQU([reg_dst+48], xmm_dst_l1)

    # Update bookkeeping.
    ADD(reg_dst, 64)
    ADD(reg_idx, 2)
    SUB(reg_bytes, 64)

    LABEL(process_32bytes)
    ADD(reg_bytes, 64)
    out = Label()
    SUB(reg_bytes, 32)
    JB(out)

    #
    # Process 2 * 16 bytes
    #

    # Pick the final L from the table.  This is the only time
    # where wrapping needs to happen based on the index.
    AND(reg_idx, 7)
    SHL(reg_idx, 4)
    ADD(reg_l, reg_idx)       # reg_l += reg_idx (&L[i%8])

    # o0 = aes4(J[1] ^ I ^ L[i%8] ^ S[:], keys) // E(1,i)
    MOVDQA(xmm_o0, xmm_s)  # o0 = s
    PXOR(xmm_o0, xmm_iDbl) # o0 ^= I
    PXOR(xmm_o0, [reg_l])  # L[i%8]
    aesenc4x1(xmm_o0, xmm_j, xmm_i, xmm_l, xmm_zero)

    # Load dst into registers.
    MOVDQU(xmm_dst_l0, [reg_dst])    # dst_l = dst[:]
    MOVDQU(xmm_dst_r0, [reg_dst+16]) # dst_r = dst[16:]

    # dst_l ^= o0
    PXOR(xmm_dst_l0, xmm_o0)

    # Y ^= dst_l
    PXOR(xmm_y, xmm_dst_l0)

    # dst_r ^= o0
    PXOR(xmm_o0, xmm_dst_r0)
    MOVDQA(xmm_dst_r0, xmm_o0) # dst_r = o0

    # o0 = aes4(o0 ^ I[0]) // E(0,0)
    PXOR(xmm_o0, xmm_i)
    aesenc4x1(xmm_o0, xmm_j, xmm_i, xmm_l, xmm_zero)

    # dst_l ^= o0
    PXOR(xmm_o0, xmm_dst_l0)
    MOVDQA(xmm_dst_l0, xmm_o0) # dst_l = o0

    # o0 = aes4(o0 ^ J[0] ^ I ^ L[i%8]) // E(1,i)
    PXOR(xmm_o0, [reg_j])
    PXOR(xmm_o0, xmm_iDbl)
    PXOR(xmm_o0, [reg_l])
    aesenc4x1(xmm_o0, xmm_j, xmm_i, xmm_l, xmm_zero)

    # dst_r ^= o0
    PXOR(xmm_dst_r0, xmm_o0)

    # dst_l, dst_r = dst_r, dst_l
    MOVDQU([reg_dst], xmm_dst_r0)
    MOVDQU([reg_dst+16], xmm_dst_l0)

    LABEL(out)

    # Write back Y.
    MOVDQU([reg_y], xmm_y)

    # Paranoia, cleanse the scratch space.  Most of it is purged
    # at the end of the 16x16 loop, but the 8x16 case uses these 4.
    MOVDQA(mem_dst_l0, xmm_zero)
    MOVDQA(mem_dst_l1, xmm_zero)
    MOVDQA(mem_dst_l2, xmm_zero)
    MOVDQA(mem_dst_l3, xmm_zero)

    # Restore the stack pointer.
    MOV(registers.rsp, reg_sp_save)

    RETURN()
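
# Pass 2, per 32-byte block pair (l, r) at index i, as implemented by the
# tiers above (I denotes the running doubled e.I[1] value, I[0]/J[0] the
# first extracted key blocks; J[1] is folded into S once up front):
#   t  = AES4(J[1] ^ I ^ L[i%8] ^ S) // E(1,i)
#   l ^= t;  Y ^= l;  r ^= t
#   t  = AES4(r ^ I[0])              // E(0,0)
#   l ^= t
#   t  = AES4(l ^ J[0] ^ I ^ L[i%8]) // E(1,i)
#   r ^= t
#   dst pair = (r, l)                // halves swapped on store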