github.com/Psiphon-Labs/psiphon-tunnel-core@v2.0.28+incompatible/psiphon/common/crypto/Yawning/chacha20/chacha20_amd64.py

     1  #!/usr/bin/env python3
     2  #
     3  # To the extent possible under law, Yawning Angel has waived all copyright
     4  # and related or neighboring rights to chacha20, using the Creative
     5  # Commons "CC0" public domain dedication. See LICENSE or
     6  # <http://creativecommons.org/publicdomain/zero/1.0/> for full details.
     7  
     8  #
     9  # cgo sucks.  Plan 9 assembly sucks.  Real languages have SIMD intrinsics.
    10  # The least terrible option is to use a Python code generator, so
    11  # that's what I did.
    12  #
    13  # Code based on Ted Krovetz's vec128 C implementation, with corrections
    14  # to use a 64 bit counter instead of 32 bit, and to allow unaligned input and
    15  # output pointers.
    16  #
    17  # Dependencies: https://github.com/Maratyszcza/PeachPy
    18  #
    19  # python3 -m peachpy.x86_64 -mabi=goasm -S -o chacha20_amd64.s chacha20_amd64.py
    20  #
    21  
    22  from peachpy import *
    23  from peachpy.x86_64 import *
    24  
    25  x = Argument(ptr(uint32_t))
    26  inp = Argument(ptr(const_uint8_t))
    27  outp = Argument(ptr(uint8_t))
    28  nrBlocks = Argument(ptr(size_t))
    29  
    30  #
    31  # SSE2 helper functions.  A temporary register is explicitly passed in because
    32  # the main fast loop uses every single register (and even spills) so manual
    33  # control is needed.
    34  #
    35  # This used to also have a DQROUNDS helper that did 2 rounds of ChaCha like
    36  # in the C code, but the C code has the luxury of an optimizer reordering
    37  # everything, while this does not.
    38  #
    39  
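        # For reference only (a scalar sketch, not used by the generator): each of
        # the ROTW*_sse2 helpers below implements a 32-bit rotate-left by n as a
        # shift-left by n, a shift-right by 32-n, and an XOR, because SSE2 has no
        # packed dword rotate.  The scalar equivalent is:
        def _rotl32_reference(x, n):
            # Rotate the 32-bit value x left by n bits, keeping the result in 32 bits.
            return ((x << n) | (x >> (32 - n))) & 0xffffffff
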
    40  def ROTW16_sse2(tmp, d):
    41      MOVDQA(tmp, d)
    42      PSLLD(tmp, 16)
    43      PSRLD(d, 16)
    44      PXOR(d, tmp)
    45  
    46  def ROTW12_sse2(tmp, b):
    47      MOVDQA(tmp, b)
    48      PSLLD(tmp, 12)
    49      PSRLD(b, 20)
    50      PXOR(b, tmp)
    51  
    52  def ROTW8_sse2(tmp, d):
    53      MOVDQA(tmp, d)
    54      PSLLD(tmp, 8)
    55      PSRLD(d, 24)
    56      PXOR(d, tmp)
    57  
    58  def ROTW7_sse2(tmp, b):
    59      MOVDQA(tmp, b)
    60      PSLLD(tmp, 7)
    61      PSRLD(b, 25)
    62      PXOR(b, tmp)
    63  
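        # Also reference only: the ChaCha quarter round that the round loops below
        # spell out via the helpers above (the "a += b; d ^= a; d = ROTW16(d); ..."
        # comments).  This sketch assumes _rotl32_reference defined above.
        def _quarterround_reference(a, b, c, d):
            a = (a + b) & 0xffffffff; d = _rotl32_reference(d ^ a, 16)
            c = (c + d) & 0xffffffff; b = _rotl32_reference(b ^ c, 12)
            a = (a + b) & 0xffffffff; d = _rotl32_reference(d ^ a, 8)
            c = (c + d) & 0xffffffff; b = _rotl32_reference(b ^ c, 7)
            return a, b, c, d
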
    64  def WriteXor_sse2(tmp, inp, outp, d, v0, v1, v2, v3):
    65      MOVDQU(tmp, [inp+d])
    66      PXOR(tmp, v0)
    67      MOVDQU([outp+d], tmp)
    68      MOVDQU(tmp, [inp+d+16])
    69      PXOR(tmp, v1)
    70      MOVDQU([outp+d+16], tmp)
    71      MOVDQU(tmp, [inp+d+32])
    72      PXOR(tmp, v2)
    73      MOVDQU([outp+d+32], tmp)
    74      MOVDQU(tmp, [inp+d+48])
    75      PXOR(tmp, v3)
    76      MOVDQU([outp+d+48], tmp)
    77  
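        # Reference sketch of what WriteXor_sse2 does: XOR one 64-byte keystream
        # block (held as four 16-byte vectors) into the input at offset d and store
        # the result at the same offset in the output.  Here inp/outp are assumed
        # to be bytearray-like objects; the real code works on raw pointers.
        def _write_xor_reference(keystream64, inp, outp, d):
            for i in range(64):
                outp[d + i] = inp[d + i] ^ keystream64[i]
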
    78  # SSE2 ChaCha20 (aka vec128).  Does not handle partial blocks, and will
    79  # process 4/2/1 blocks at a time.
    80  with Function("blocksAmd64SSE2", (x, inp, outp, nrBlocks)):
    81      reg_x = GeneralPurposeRegister64()
    82      reg_inp = GeneralPurposeRegister64()
    83      reg_outp = GeneralPurposeRegister64()
    84      reg_blocks = GeneralPurposeRegister64()
    85      reg_sp_save = GeneralPurposeRegister64()
    86  
    87      LOAD.ARGUMENT(reg_x, x)
    88      LOAD.ARGUMENT(reg_inp, inp)
    89      LOAD.ARGUMENT(reg_outp, outp)
    90      LOAD.ARGUMENT(reg_blocks, nrBlocks)
    91  
    92      # Align the stack to a 32 byte boundary.
    93      MOV(reg_sp_save, registers.rsp)
    94      AND(registers.rsp, 0xffffffffffffffe0)
    95      SUB(registers.rsp, 0x20)
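            # (The AND rounds rsp down to a 32-byte boundary and the extra SUB keeps
            # it strictly below the caller's frame; reg_sp_save holds the original
            # rsp, which is restored on the way out.)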
    96  
    97      # Build the counter increment vector on the stack, and allocate the scratch
    98      # space
    99      xmm_v0 = XMMRegister()
   100      PXOR(xmm_v0, xmm_v0)
   101      SUB(registers.rsp, 16+16)
   102      MOVDQA([registers.rsp], xmm_v0)
   103      reg_tmp = GeneralPurposeRegister32()
   104      MOV(reg_tmp, 0x00000001)
   105      MOV([registers.rsp], reg_tmp)
   106      mem_one = [registers.rsp]     # (Stack) Counter increment vector
   107      mem_tmp0 = [registers.rsp+16] # (Stack) Scratch space.
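            # mem_one is the 16-byte vector {1, 0} (two quadwords), so PADDQ-ing it
            # into a copy of state[12..15] bumps the 64-bit block counter in words
            # 12..13 without disturbing the nonce in words 14..15.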
   108  
   109      mem_s0 = [reg_x]           # (Memory) Cipher state [0..3]
   110      mem_s1 = [reg_x+16]        # (Memory) Cipher state [4..7]
   111      mem_s2 = [reg_x+32]        # (Memory) Cipher state [8..11]
   112      mem_s3 = [reg_x+48]        # (Memory) Cipher state [12..15]
   113  
   114      # xmm_v0 allocated above...
   115      xmm_v1 = XMMRegister()
   116      xmm_v2 = XMMRegister()
   117      xmm_v3 = XMMRegister()
   118  
   119      xmm_v4 = XMMRegister()
   120      xmm_v5 = XMMRegister()
   121      xmm_v6 = XMMRegister()
   122      xmm_v7 = XMMRegister()
   123  
   124      xmm_v8 = XMMRegister()
   125      xmm_v9 = XMMRegister()
   126      xmm_v10 = XMMRegister()
   127      xmm_v11 = XMMRegister()
   128  
   129      xmm_v12 = XMMRegister()
   130      xmm_v13 = XMMRegister()
   131      xmm_v14 = XMMRegister()
   132      xmm_v15 = XMMRegister()
   133  
   134      xmm_tmp = xmm_v12
   135  
   136      #
   137      # 4 blocks at a time.
   138      #
   139  
   140      reg_rounds = GeneralPurposeRegister64()
   141  
   142      vector_loop4 = Loop()
   143      SUB(reg_blocks, 4)
   144      JB(vector_loop4.end)
   145      with vector_loop4:
   146          MOVDQU(xmm_v0, mem_s0)
   147          MOVDQU(xmm_v1, mem_s1)
   148          MOVDQU(xmm_v2, mem_s2)
   149          MOVDQU(xmm_v3, mem_s3)
   150  
   151          MOVDQA(xmm_v4, xmm_v0)
   152          MOVDQA(xmm_v5, xmm_v1)
   153          MOVDQA(xmm_v6, xmm_v2)
   154          MOVDQA(xmm_v7, xmm_v3)
   155          PADDQ(xmm_v7, mem_one)
   156  
   157          MOVDQA(xmm_v8, xmm_v0)
   158          MOVDQA(xmm_v9, xmm_v1)
   159          MOVDQA(xmm_v10, xmm_v2)
   160          MOVDQA(xmm_v11, xmm_v7)
   161          PADDQ(xmm_v11, mem_one)
   162  
   163          MOVDQA(xmm_v12, xmm_v0)
   164          MOVDQA(xmm_v13, xmm_v1)
   165          MOVDQA(xmm_v14, xmm_v2)
   166          MOVDQA(xmm_v15, xmm_v11)
   167          PADDQ(xmm_v15, mem_one)
   168  
   169          MOV(reg_rounds, 20)
   170          rounds_loop4 = Loop()
   171          with rounds_loop4:
   172              # a += b; d ^= a; d = ROTW16(d);
   173              PADDD(xmm_v0, xmm_v1)
   174              PADDD(xmm_v4, xmm_v5)
   175              PADDD(xmm_v8, xmm_v9)
   176              PADDD(xmm_v12, xmm_v13)
   177              PXOR(xmm_v3, xmm_v0)
   178              PXOR(xmm_v7, xmm_v4)
   179              PXOR(xmm_v11, xmm_v8)
   180              PXOR(xmm_v15, xmm_v12)
   181  
   182              MOVDQA(mem_tmp0, xmm_tmp) # Save
   183  
   184              ROTW16_sse2(xmm_tmp, xmm_v3)
   185              ROTW16_sse2(xmm_tmp, xmm_v7)
   186              ROTW16_sse2(xmm_tmp, xmm_v11)
   187              ROTW16_sse2(xmm_tmp, xmm_v15)
   188  
   189              # c += d; b ^= c; b = ROTW12(b);
   190              PADDD(xmm_v2, xmm_v3)
   191              PADDD(xmm_v6, xmm_v7)
   192              PADDD(xmm_v10, xmm_v11)
   193              PADDD(xmm_v14, xmm_v15)
   194              PXOR(xmm_v1, xmm_v2)
   195              PXOR(xmm_v5, xmm_v6)
   196              PXOR(xmm_v9, xmm_v10)
   197              PXOR(xmm_v13, xmm_v14)
   198              ROTW12_sse2(xmm_tmp, xmm_v1)
   199              ROTW12_sse2(xmm_tmp, xmm_v5)
   200              ROTW12_sse2(xmm_tmp, xmm_v9)
   201              ROTW12_sse2(xmm_tmp, xmm_v13)
   202  
   203              # a += b; d ^= a; d = ROTW8(d);
   204              MOVDQA(xmm_tmp, mem_tmp0) # Restore
   205  
   206              PADDD(xmm_v0, xmm_v1)
   207              PADDD(xmm_v4, xmm_v5)
   208              PADDD(xmm_v8, xmm_v9)
   209              PADDD(xmm_v12, xmm_v13)
   210              PXOR(xmm_v3, xmm_v0)
   211              PXOR(xmm_v7, xmm_v4)
   212              PXOR(xmm_v11, xmm_v8)
   213              PXOR(xmm_v15, xmm_v12)
   214  
   215              MOVDQA(mem_tmp0, xmm_tmp) # Save
   216  
   217              ROTW8_sse2(xmm_tmp, xmm_v3)
   218              ROTW8_sse2(xmm_tmp, xmm_v7)
   219              ROTW8_sse2(xmm_tmp, xmm_v11)
   220              ROTW8_sse2(xmm_tmp, xmm_v15)
   221  
   222              # c += d; b ^= c; b = ROTW7(b)
   223              PADDD(xmm_v2, xmm_v3)
   224              PADDD(xmm_v6, xmm_v7)
   225              PADDD(xmm_v10, xmm_v11)
   226              PADDD(xmm_v14, xmm_v15)
   227              PXOR(xmm_v1, xmm_v2)
   228              PXOR(xmm_v5, xmm_v6)
   229              PXOR(xmm_v9, xmm_v10)
   230              PXOR(xmm_v13, xmm_v14)
   231              ROTW7_sse2(xmm_tmp, xmm_v1)
   232              ROTW7_sse2(xmm_tmp, xmm_v5)
   233              ROTW7_sse2(xmm_tmp, xmm_v9)
   234              ROTW7_sse2(xmm_tmp, xmm_v13)
   235  
   236              # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
   237              PSHUFD(xmm_v1, xmm_v1, 0x39)
   238              PSHUFD(xmm_v5, xmm_v5, 0x39)
   239              PSHUFD(xmm_v9, xmm_v9, 0x39)
   240              PSHUFD(xmm_v13, xmm_v13, 0x39)
   241              PSHUFD(xmm_v2, xmm_v2, 0x4e)
   242              PSHUFD(xmm_v6, xmm_v6, 0x4e)
   243              PSHUFD(xmm_v10, xmm_v10, 0x4e)
   244              PSHUFD(xmm_v14, xmm_v14, 0x4e)
   245              PSHUFD(xmm_v3, xmm_v3, 0x93)
   246              PSHUFD(xmm_v7, xmm_v7, 0x93)
   247              PSHUFD(xmm_v11, xmm_v11, 0x93)
   248              PSHUFD(xmm_v15, xmm_v15, 0x93)
   249  
   250              MOVDQA(xmm_tmp, mem_tmp0) # Restore
   251  
   252              # a += b; d ^= a; d = ROTW16(d);
   253              PADDD(xmm_v0, xmm_v1)
   254              PADDD(xmm_v4, xmm_v5)
   255              PADDD(xmm_v8, xmm_v9)
   256              PADDD(xmm_v12, xmm_v13)
   257              PXOR(xmm_v3, xmm_v0)
   258              PXOR(xmm_v7, xmm_v4)
   259              PXOR(xmm_v11, xmm_v8)
   260              PXOR(xmm_v15, xmm_v12)
   261  
   262              MOVDQA(mem_tmp0, xmm_tmp) # Save
   263  
   264              ROTW16_sse2(xmm_tmp, xmm_v3)
   265              ROTW16_sse2(xmm_tmp, xmm_v7)
   266              ROTW16_sse2(xmm_tmp, xmm_v11)
   267              ROTW16_sse2(xmm_tmp, xmm_v15)
   268  
   269              # c += d; b ^= c; b = ROTW12(b);
   270              PADDD(xmm_v2, xmm_v3)
   271              PADDD(xmm_v6, xmm_v7)
   272              PADDD(xmm_v10, xmm_v11)
   273              PADDD(xmm_v14, xmm_v15)
   274              PXOR(xmm_v1, xmm_v2)
   275              PXOR(xmm_v5, xmm_v6)
   276              PXOR(xmm_v9, xmm_v10)
   277              PXOR(xmm_v13, xmm_v14)
   278              ROTW12_sse2(xmm_tmp, xmm_v1)
   279              ROTW12_sse2(xmm_tmp, xmm_v5)
   280              ROTW12_sse2(xmm_tmp, xmm_v9)
   281              ROTW12_sse2(xmm_tmp, xmm_v13)
   282  
   283              # a += b; d ^= a; d = ROTW8(d);
   284              MOVDQA(xmm_tmp, mem_tmp0) # Restore
   285  
   286              PADDD(xmm_v0, xmm_v1)
   287              PADDD(xmm_v4, xmm_v5)
   288              PADDD(xmm_v8, xmm_v9)
   289              PADDD(xmm_v12, xmm_v13)
   290              PXOR(xmm_v3, xmm_v0)
   291              PXOR(xmm_v7, xmm_v4)
   292              PXOR(xmm_v11, xmm_v8)
   293              PXOR(xmm_v15, xmm_v12)
   294  
   295              MOVDQA(mem_tmp0, xmm_tmp) # Save
   296  
   297              ROTW8_sse2(xmm_tmp, xmm_v3)
   298              ROTW8_sse2(xmm_tmp, xmm_v7)
   299              ROTW8_sse2(xmm_tmp, xmm_v11)
   300              ROTW8_sse2(xmm_tmp, xmm_v15)
   301  
   302              # c += d; b ^= c; b = ROTW7(b)
   303              PADDD(xmm_v2, xmm_v3)
   304              PADDD(xmm_v6, xmm_v7)
   305              PADDD(xmm_v10, xmm_v11)
   306              PADDD(xmm_v14, xmm_v15)
   307              PXOR(xmm_v1, xmm_v2)
   308              PXOR(xmm_v5, xmm_v6)
   309              PXOR(xmm_v9, xmm_v10)
   310              PXOR(xmm_v13, xmm_v14)
   311              ROTW7_sse2(xmm_tmp, xmm_v1)
   312              ROTW7_sse2(xmm_tmp, xmm_v5)
   313              ROTW7_sse2(xmm_tmp, xmm_v9)
   314              ROTW7_sse2(xmm_tmp, xmm_v13)
   315  
   316              # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
   317              PSHUFD(xmm_v1, xmm_v1, 0x93)
   318              PSHUFD(xmm_v5, xmm_v5, 0x93)
   319              PSHUFD(xmm_v9, xmm_v9, 0x93)
   320              PSHUFD(xmm_v13, xmm_v13, 0x93)
   321              PSHUFD(xmm_v2, xmm_v2, 0x4e)
   322              PSHUFD(xmm_v6, xmm_v6, 0x4e)
   323              PSHUFD(xmm_v10, xmm_v10, 0x4e)
   324              PSHUFD(xmm_v14, xmm_v14, 0x4e)
   325              PSHUFD(xmm_v3, xmm_v3, 0x39)
   326              PSHUFD(xmm_v7, xmm_v7, 0x39)
   327              PSHUFD(xmm_v11, xmm_v11, 0x39)
   328              PSHUFD(xmm_v15, xmm_v15, 0x39)
   329  
   330              MOVDQA(xmm_tmp, mem_tmp0) # Restore
   331  
   332              SUB(reg_rounds, 2)
   333              JNZ(rounds_loop4.begin)
   334  
   335          MOVDQA(mem_tmp0, xmm_tmp)
   336  
   337          PADDD(xmm_v0, mem_s0)
   338          PADDD(xmm_v1, mem_s1)
   339          PADDD(xmm_v2, mem_s2)
   340          PADDD(xmm_v3, mem_s3)
   341          WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 0, xmm_v0, xmm_v1, xmm_v2, xmm_v3)
   342          MOVDQU(xmm_v3, mem_s3)
   343          PADDQ(xmm_v3, mem_one)
   344  
   345          PADDD(xmm_v4, mem_s0)
   346          PADDD(xmm_v5, mem_s1)
   347          PADDD(xmm_v6, mem_s2)
   348          PADDD(xmm_v7, xmm_v3)
   349          WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 64, xmm_v4, xmm_v5, xmm_v6, xmm_v7)
   350          PADDQ(xmm_v3, mem_one)
   351  
   352          PADDD(xmm_v8, mem_s0)
   353          PADDD(xmm_v9, mem_s1)
   354          PADDD(xmm_v10, mem_s2)
   355          PADDD(xmm_v11, xmm_v3)
   356          WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 128, xmm_v8, xmm_v9, xmm_v10, xmm_v11)
   357          PADDQ(xmm_v3, mem_one)
   358  
   359          MOVDQA(xmm_tmp, mem_tmp0)
   360  
   361          PADDD(xmm_v12, mem_s0)
   362          PADDD(xmm_v13, mem_s1)
   363          PADDD(xmm_v14, mem_s2)
   364          PADDD(xmm_v15, xmm_v3)
   365          WriteXor_sse2(xmm_v0, reg_inp, reg_outp, 192, xmm_v12, xmm_v13, xmm_v14, xmm_v15)
   366          PADDQ(xmm_v3, mem_one)
   367  
   368          MOVDQU(mem_s3, xmm_v3)
   369  
   370          ADD(reg_inp, 4 * 64)
   371          ADD(reg_outp, 4 * 64)
   372  
   373          SUB(reg_blocks, 4)
   374          JAE(vector_loop4.begin)
   375  
   376      ADD(reg_blocks, 4)
   377      out = Label()
   378      JZ(out)
   379  
   380      # Past this point, we no longer need to use every single register to hold
   381      # the in-progress state.
   382  
   383      xmm_s0 = xmm_v8
   384      xmm_s1 = xmm_v9
   385      xmm_s2 = xmm_v10
   386      xmm_s3 = xmm_v11
   387      xmm_one = xmm_v13
   388      MOVDQU(xmm_s0, mem_s0)
   389      MOVDQU(xmm_s1, mem_s1)
   390      MOVDQU(xmm_s2, mem_s2)
   391      MOVDQU(xmm_s3, mem_s3)
   392      MOVDQA(xmm_one, mem_one)
   393  
   394      #
   395      # 2 blocks at a time.
   396      #
   397  
   398      process_1_block = Label()
   399      SUB(reg_blocks, 2)
   400      JB(process_1_block) # < 2 blocks remaining.
   401  
   402      MOVDQA(xmm_v0, xmm_s0)
   403      MOVDQA(xmm_v1, xmm_s1)
   404      MOVDQA(xmm_v2, xmm_s2)
   405      MOVDQA(xmm_v3, xmm_s3)
   406  
   407      MOVDQA(xmm_v4, xmm_v0)
   408      MOVDQA(xmm_v5, xmm_v1)
   409      MOVDQA(xmm_v6, xmm_v2)
   410      MOVDQA(xmm_v7, xmm_v3)
   411      PADDQ(xmm_v7, xmm_one)
   412  
   413      MOV(reg_rounds, 20)
   414      rounds_loop2 = Loop()
   415      with rounds_loop2:
   416          # a += b; d ^= a; d = ROTW16(d);
   417          PADDD(xmm_v0, xmm_v1)
   418          PADDD(xmm_v4, xmm_v5)
   419          PXOR(xmm_v3, xmm_v0)
   420          PXOR(xmm_v7, xmm_v4)
   421          ROTW16_sse2(xmm_tmp, xmm_v3)
   422          ROTW16_sse2(xmm_tmp, xmm_v7)
   423  
   424          # c += d; b ^= c; b = ROTW12(b);
   425          PADDD(xmm_v2, xmm_v3)
   426          PADDD(xmm_v6, xmm_v7)
   427          PXOR(xmm_v1, xmm_v2)
   428          PXOR(xmm_v5, xmm_v6)
   429          ROTW12_sse2(xmm_tmp, xmm_v1)
   430          ROTW12_sse2(xmm_tmp, xmm_v5)
   431  
   432          # a += b; d ^= a; d = ROTW8(d);
   433          PADDD(xmm_v0, xmm_v1)
   434          PADDD(xmm_v4, xmm_v5)
   435          PXOR(xmm_v3, xmm_v0)
   436          PXOR(xmm_v7, xmm_v4)
   437          ROTW8_sse2(xmm_tmp, xmm_v3)
   438          ROTW8_sse2(xmm_tmp, xmm_v7)
   439  
   440          # c += d; b ^= c; b = ROTW7(b)
   441          PADDD(xmm_v2, xmm_v3)
   442          PADDD(xmm_v6, xmm_v7)
   443          PXOR(xmm_v1, xmm_v2)
   444          PXOR(xmm_v5, xmm_v6)
   445          ROTW7_sse2(xmm_tmp, xmm_v1)
   446          ROTW7_sse2(xmm_tmp, xmm_v5)
   447  
   448          # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
   449          PSHUFD(xmm_v1, xmm_v1, 0x39)
   450          PSHUFD(xmm_v5, xmm_v5, 0x39)
   451          PSHUFD(xmm_v2, xmm_v2, 0x4e)
   452          PSHUFD(xmm_v6, xmm_v6, 0x4e)
   453          PSHUFD(xmm_v3, xmm_v3, 0x93)
   454          PSHUFD(xmm_v7, xmm_v7, 0x93)
   455  
   456          # a += b; d ^= a; d = ROTW16(d);
   457          PADDD(xmm_v0, xmm_v1)
   458          PADDD(xmm_v4, xmm_v5)
   459          PXOR(xmm_v3, xmm_v0)
   460          PXOR(xmm_v7, xmm_v4)
   461          ROTW16_sse2(xmm_tmp, xmm_v3)
   462          ROTW16_sse2(xmm_tmp, xmm_v7)
   463  
   464          # c += d; b ^= c; b = ROTW12(b);
   465          PADDD(xmm_v2, xmm_v3)
   466          PADDD(xmm_v6, xmm_v7)
   467          PXOR(xmm_v1, xmm_v2)
   468          PXOR(xmm_v5, xmm_v6)
   469          ROTW12_sse2(xmm_tmp, xmm_v1)
   470          ROTW12_sse2(xmm_tmp, xmm_v5)
   471  
   472          # a += b; d ^= a; d = ROTW8(d);
   473          PADDD(xmm_v0, xmm_v1)
   474          PADDD(xmm_v4, xmm_v5)
   475          PXOR(xmm_v3, xmm_v0)
   476          PXOR(xmm_v7, xmm_v4)
   477          ROTW8_sse2(xmm_tmp, xmm_v3)
   478          ROTW8_sse2(xmm_tmp, xmm_v7)
   479  
   480          # c += d; b ^= c; b = ROTW7(b)
   481          PADDD(xmm_v2, xmm_v3)
   482          PADDD(xmm_v6, xmm_v7)
   483          PXOR(xmm_v1, xmm_v2)
   484          PXOR(xmm_v5, xmm_v6)
   485          ROTW7_sse2(xmm_tmp, xmm_v1)
   486          ROTW7_sse2(xmm_tmp, xmm_v5)
   487  
   488          # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
   489          PSHUFD(xmm_v1, xmm_v1, 0x93)
   490          PSHUFD(xmm_v5, xmm_v5, 0x93)
   491          PSHUFD(xmm_v2, xmm_v2, 0x4e)
   492          PSHUFD(xmm_v6, xmm_v6, 0x4e)
   493          PSHUFD(xmm_v3, xmm_v3, 0x39)
   494          PSHUFD(xmm_v7, xmm_v7, 0x39)
   495  
   496          SUB(reg_rounds, 2)
   497          JNZ(rounds_loop2.begin)
   498  
   499      PADDD(xmm_v0, xmm_s0)
   500      PADDD(xmm_v1, xmm_s1)
   501      PADDD(xmm_v2, xmm_s2)
   502      PADDD(xmm_v3, xmm_s3)
   503      WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 0, xmm_v0, xmm_v1, xmm_v2, xmm_v3)
   504      PADDQ(xmm_s3, xmm_one)
   505  
   506      PADDD(xmm_v4, xmm_s0)
   507      PADDD(xmm_v5, xmm_s1)
   508      PADDD(xmm_v6, xmm_s2)
   509      PADDD(xmm_v7, xmm_s3)
   510      WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 64, xmm_v4, xmm_v5, xmm_v6, xmm_v7)
   511      PADDQ(xmm_s3, xmm_one)
   512  
   513      ADD(reg_inp, 2 * 64)
   514      ADD(reg_outp, 2 * 64)
   515      SUB(reg_blocks, 2)
   516  
   517      LABEL(process_1_block)
   518      ADD(reg_blocks, 2)
   519      out_serial = Label()
   520      JZ(out_serial)
   521  
   522      #
   523      # 1 block at a time.  Only executed once, because if more than 1 block
   524      # remained, the parallel code above would have already processed it.
   525      #
   526  
   527      MOVDQA(xmm_v0, xmm_s0)
   528      MOVDQA(xmm_v1, xmm_s1)
   529      MOVDQA(xmm_v2, xmm_s2)
   530      MOVDQA(xmm_v3, xmm_s3)
   531  
   532      MOV(reg_rounds, 20)
   533      rounds_loop1 = Loop()
   534      with rounds_loop1:
   535          # a += b; d ^= a; d = ROTW16(d);
   536          PADDD(xmm_v0, xmm_v1)
   537          PXOR(xmm_v3, xmm_v0)
   538          ROTW16_sse2(xmm_tmp, xmm_v3)
   539  
   540          # c += d; b ^= c; b = ROTW12(b);
   541          PADDD(xmm_v2, xmm_v3)
   542          PXOR(xmm_v1, xmm_v2)
   543          ROTW12_sse2(xmm_tmp, xmm_v1)
   544  
   545          # a += b; d ^= a; d = ROTW8(d);
   546          PADDD(xmm_v0, xmm_v1)
   547          PXOR(xmm_v3, xmm_v0)
   548          ROTW8_sse2(xmm_tmp, xmm_v3)
   549  
   550          # c += d; b ^= c; b = ROTW7(b)
   551          PADDD(xmm_v2, xmm_v3)
   552          PXOR(xmm_v1, xmm_v2)
   553          ROTW7_sse2(xmm_tmp, xmm_v1)
   554  
   555          # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
   556          PSHUFD(xmm_v1, xmm_v1, 0x39)
   557          PSHUFD(xmm_v2, xmm_v2, 0x4e)
   558          PSHUFD(xmm_v3, xmm_v3, 0x93)
   559  
   560          # a += b; d ^= a; d = ROTW16(d);
   561          PADDD(xmm_v0, xmm_v1)
   562          PXOR(xmm_v3, xmm_v0)
   563          ROTW16_sse2(xmm_tmp, xmm_v3)
   564  
   565          # c += d; b ^= c; b = ROTW12(b);
   566          PADDD(xmm_v2, xmm_v3)
   567          PXOR(xmm_v1, xmm_v2)
   568          ROTW12_sse2(xmm_tmp, xmm_v1)
   569  
   570          # a += b; d ^= a; d = ROTW8(d);
   571          PADDD(xmm_v0, xmm_v1)
   572          PXOR(xmm_v3, xmm_v0)
   573          ROTW8_sse2(xmm_tmp, xmm_v3)
   574  
   575          # c += d; b ^= c; b = ROTW7(b)
   576          PADDD(xmm_v2, xmm_v3)
   577          PXOR(xmm_v1, xmm_v2)
   578          ROTW7_sse2(xmm_tmp, xmm_v1)
   579  
   580          # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
   581          PSHUFD(xmm_v1, xmm_v1, 0x93)
   582          PSHUFD(xmm_v2, xmm_v2, 0x4e)
   583          PSHUFD(xmm_v3, xmm_v3, 0x39)
   584  
   585          SUB(reg_rounds, 2)
   586          JNZ(rounds_loop1.begin)
   587  
   588      PADDD(xmm_v0, xmm_s0)
   589      PADDD(xmm_v1, xmm_s1)
   590      PADDD(xmm_v2, xmm_s2)
   591      PADDD(xmm_v3, xmm_s3)
   592      WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 0, xmm_v0, xmm_v1, xmm_v2, xmm_v3)
   593      PADDQ(xmm_s3, xmm_one)
   594  
   595      LABEL(out_serial)
   596  
   597      # Write back the updated counter.  Stopping at 2^70 bytes is the user's
   598      # problem, not mine.  (Skipped if there's exactly a multiple of 4 blocks
   599      # because the counter is incremented in memory while looping.)
   600      MOVDQU(mem_s3, xmm_s3)
   601  
   602      LABEL(out)
   603  
   604      # Paranoia, cleanse the scratch space.
   605      PXOR(xmm_v0, xmm_v0)
   606      MOVDQA(mem_tmp0, xmm_v0)
   607  
   608      # Remove our stack allocation.
   609      MOV(registers.rsp, reg_sp_save)
   610  
   611      RETURN()
   612  
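        # Scalar reference for the per-block function (a sketch, not used by the
        # generator): blocksAmd64SSE2 above and blocksAmd64AVX2 below compute this
        # same function, batched across several blocks, using PSHUFD/VPSHUFD to move
        # the rows between the column rounds and the diagonal rounds.  Assumes
        # _quarterround_reference defined earlier.
        def _chacha20_block_reference(state16):
            # state16: 16 uint32 words (constants, key, 64-bit counter, nonce).
            x = list(state16)
            for _ in range(10):  # 20 rounds: one column pass + one diagonal pass per iteration.
                for a, b, c, d in ((0, 4, 8, 12), (1, 5, 9, 13), (2, 6, 10, 14), (3, 7, 11, 15),
                                   (0, 5, 10, 15), (1, 6, 11, 12), (2, 7, 8, 13), (3, 4, 9, 14)):
                    x[a], x[b], x[c], x[d] = _quarterround_reference(x[a], x[b], x[c], x[d])
            return [(x[i] + state16[i]) & 0xffffffff for i in range(16)]
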
   613  #
   614  # AVX2 helpers.  Like the SSE2 equivalents, the scratch register is explicit,
   615  # and more helpers are used to increase readability for destructive operations.
   616  #
   617  # XXX/Performance: ROTW16_avx2/ROTW8_avx2 both can use VPSHUFB.
   618  #
   619  
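        # (Concretely: rotate-by-16 is a single VPSHUFB with the per-dword byte
        # pattern 2,3,0,1 and rotate-by-8 uses 3,0,1,2, at the cost of keeping a
        # 32-byte mask constant around.)
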
   620  def ADD_avx2(dst, src):
   621      VPADDD(dst, dst, src)
   622  
   623  def XOR_avx2(dst, src):
   624      VPXOR(dst, dst, src)
   625  
   626  def ROTW16_avx2(tmp, d):
   627      VPSLLD(tmp, d, 16)
   628      VPSRLD(d, d, 16)
   629      XOR_avx2(d, tmp)
   630  
   631  def ROTW12_avx2(tmp, b):
   632      VPSLLD(tmp, b, 12)
   633      VPSRLD(b, b, 20)
   634      XOR_avx2(b, tmp)
   635  
   636  def ROTW8_avx2(tmp, d):
   637      VPSLLD(tmp, d, 8)
   638      VPSRLD(d, d, 24)
   639      XOR_avx2(d, tmp)
   640  
   641  def ROTW7_avx2(tmp, b):
   642      VPSLLD(tmp, b, 7)
   643      VPSRLD(b, b, 25)
   644      XOR_avx2(b, tmp)
   645  
   646  def WriteXor_avx2(tmp, inp, outp, d, v0, v1, v2, v3):
   647      # XOR_WRITE(out+ 0, in+ 0, _mm256_permute2x128_si256(v0,v1,0x20));
   648      VPERM2I128(tmp, v0, v1, 0x20)
   649      VPXOR(tmp, tmp, [inp+d])
   650      VMOVDQU([outp+d], tmp)
   651  
   652      # XOR_WRITE(out+32, in+32, _mm256_permute2x128_si256(v2,v3,0x20));
   653      VPERM2I128(tmp, v2, v3, 0x20)
   654      VPXOR(tmp, tmp, [inp+d+32])
   655      VMOVDQU([outp+d+32], tmp)
   656  
   657      # XOR_WRITE(out+64, in+64, _mm256_permute2x128_si256(v0,v1,0x31));
   658      VPERM2I128(tmp, v0, v1, 0x31)
   659      VPXOR(tmp, tmp, [inp+d+64])
   660      VMOVDQU([outp+d+64], tmp)
   661  
   662      # XOR_WRITE(out+96, in+96, _mm256_permute2x128_si256(v2,v3,0x31));
   663      VPERM2I128(tmp, v2, v3, 0x31)
   664      VPXOR(tmp, tmp, [inp+d+96])
   665      VMOVDQU([outp+d+96], tmp)
   666  
   667  # AVX2 ChaCha20 (aka avx2).  Does not handle partial blocks, and will
   668  # process 8/4/2 blocks at a time.
   669  with Function("blocksAmd64AVX2", (x, inp, outp, nrBlocks), target=uarch.broadwell):
   670      reg_x = GeneralPurposeRegister64()
   671      reg_inp = GeneralPurposeRegister64()
   672      reg_outp = GeneralPurposeRegister64()
   673      reg_blocks = GeneralPurposeRegister64()
   674      reg_sp_save = GeneralPurposeRegister64()
   675  
   676      LOAD.ARGUMENT(reg_x, x)
   677      LOAD.ARGUMENT(reg_inp, inp)
   678      LOAD.ARGUMENT(reg_outp, outp)
   679      LOAD.ARGUMENT(reg_blocks, nrBlocks)
   680  
   681      # Align the stack to a 32 byte boundary.
   682      MOV(reg_sp_save, registers.rsp)
   683      AND(registers.rsp, 0xffffffffffffffe0)
   684      SUB(registers.rsp, 0x20)
   685  
   686      x_s0 = [reg_x]           # (Memory) Cipher state [0..3]
   687      x_s1 = [reg_x+16]        # (Memory) Cipher state [4..7]
   688      x_s2 = [reg_x+32]        # (Memory) Cipher state [8..11]
   689      x_s3 = [reg_x+48]        # (Memory) Cipher state [12..15]
   690  
   691      ymm_v0 = YMMRegister()
   692      ymm_v1 = YMMRegister()
   693      ymm_v2 = YMMRegister()
   694      ymm_v3 = YMMRegister()
   695  
   696      ymm_v4 = YMMRegister()
   697      ymm_v5 = YMMRegister()
   698      ymm_v6 = YMMRegister()
   699      ymm_v7 = YMMRegister()
   700  
   701      ymm_v8 = YMMRegister()
   702      ymm_v9 = YMMRegister()
   703      ymm_v10 = YMMRegister()
   704      ymm_v11 = YMMRegister()
   705  
   706      ymm_v12 = YMMRegister()
   707      ymm_v13 = YMMRegister()
   708      ymm_v14 = YMMRegister()
   709      ymm_v15 = YMMRegister()
   710  
   711      ymm_tmp0 = ymm_v12
   712  
   713      # Allocate the necessary stack space for the counter vector and two ymm
   714      # registers that we will spill.
   715      SUB(registers.rsp, 96)
   716      mem_tmp0 = [registers.rsp+64]  # (Stack) Scratch space.
   717      mem_s3 = [registers.rsp+32]    # (Stack) Working copy of s3. (8x)
   718      mem_inc = [registers.rsp]      # (Stack) Counter increment vector.
   719  
   720      # Increment the counter for one side of the state vector.
   721      VPXOR(ymm_tmp0, ymm_tmp0, ymm_tmp0)
   722      VMOVDQU(mem_inc, ymm_tmp0)
   723      reg_tmp = GeneralPurposeRegister32()
   724      MOV(reg_tmp, 0x00000001)
   725      MOV([registers.rsp+16], reg_tmp)
   726      VBROADCASTI128(ymm_v3, x_s3)
   727      VPADDQ(ymm_v3, ymm_v3, [registers.rsp])
   728      VMOVDQA(mem_s3, ymm_v3)
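            # mem_s3 now holds two copies of the s3 row, with block counter n in the
            # low 128-bit lane and n+1 in the high lane -- the 2-wide layout all of
            # the AVX2 paths below work on.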
   729  
   730      # Since we process 2xN blocks at a time, the counter increment for both
   731      # sides of the state vector is 2.
   732      MOV(reg_tmp, 0x00000002)
   733      MOV([registers.rsp], reg_tmp)
   734      MOV([registers.rsp+16], reg_tmp)
   735  
   736      out_write_even = Label()
   737      out_write_odd = Label()
   738  
   739      #
   740      # 8 blocks at a time.  Ted Krovetz's avx2 code does not do this, but it's
   741      # a decent gain despite all the pain...
   742      #
   743  
   744      reg_rounds = GeneralPurposeRegister64()
   745  
   746      vector_loop8 = Loop()
   747      SUB(reg_blocks, 8)
   748      JB(vector_loop8.end)
   749      with vector_loop8:
   750          VBROADCASTI128(ymm_v0, x_s0)
   751          VBROADCASTI128(ymm_v1, x_s1)
   752          VBROADCASTI128(ymm_v2, x_s2)
   753          VMOVDQA(ymm_v3, mem_s3)
   754  
   755          VMOVDQA(ymm_v4, ymm_v0)
   756          VMOVDQA(ymm_v5, ymm_v1)
   757          VMOVDQA(ymm_v6, ymm_v2)
   758          VPADDQ(ymm_v7, ymm_v3, mem_inc)
   759  
   760          VMOVDQA(ymm_v8, ymm_v0)
   761          VMOVDQA(ymm_v9, ymm_v1)
   762          VMOVDQA(ymm_v10, ymm_v2)
   763          VPADDQ(ymm_v11, ymm_v7, mem_inc)
   764  
   765          VMOVDQA(ymm_v12, ymm_v0)
   766          VMOVDQA(ymm_v13, ymm_v1)
   767          VMOVDQA(ymm_v14, ymm_v2)
   768          VPADDQ(ymm_v15, ymm_v11, mem_inc)
   769  
   770          MOV(reg_rounds, 20)
   771          rounds_loop8 = Loop()
   772          with rounds_loop8:
   773              # a += b; d ^= a; d = ROTW16(d);
   774              ADD_avx2(ymm_v0, ymm_v1)
   775              ADD_avx2(ymm_v4, ymm_v5)
   776              ADD_avx2(ymm_v8, ymm_v9)
   777              ADD_avx2(ymm_v12, ymm_v13)
   778              XOR_avx2(ymm_v3, ymm_v0)
   779              XOR_avx2(ymm_v7, ymm_v4)
   780              XOR_avx2(ymm_v11, ymm_v8)
   781              XOR_avx2(ymm_v15, ymm_v12)
   782  
   783              VMOVDQA(mem_tmp0, ymm_tmp0) # Save
   784  
   785              ROTW16_avx2(ymm_tmp0, ymm_v3)
   786              ROTW16_avx2(ymm_tmp0, ymm_v7)
   787              ROTW16_avx2(ymm_tmp0, ymm_v11)
   788              ROTW16_avx2(ymm_tmp0, ymm_v15)
   789  
   790              # c += d; b ^= c; b = ROTW12(b);
   791              ADD_avx2(ymm_v2, ymm_v3)
   792              ADD_avx2(ymm_v6, ymm_v7)
   793              ADD_avx2(ymm_v10, ymm_v11)
   794              ADD_avx2(ymm_v14, ymm_v15)
   795              XOR_avx2(ymm_v1, ymm_v2)
   796              XOR_avx2(ymm_v5, ymm_v6)
   797              XOR_avx2(ymm_v9, ymm_v10)
   798              XOR_avx2(ymm_v13, ymm_v14)
   799              ROTW12_avx2(ymm_tmp0, ymm_v1)
   800              ROTW12_avx2(ymm_tmp0, ymm_v5)
   801              ROTW12_avx2(ymm_tmp0, ymm_v9)
   802              ROTW12_avx2(ymm_tmp0, ymm_v13)
   803  
   804              # a += b; d ^= a; d = ROTW8(d);
   805              VMOVDQA(ymm_tmp0, mem_tmp0) # Restore
   806  
   807              ADD_avx2(ymm_v0, ymm_v1)
   808              ADD_avx2(ymm_v4, ymm_v5)
   809              ADD_avx2(ymm_v8, ymm_v9)
   810              ADD_avx2(ymm_v12, ymm_v13)
   811              XOR_avx2(ymm_v3, ymm_v0)
   812              XOR_avx2(ymm_v7, ymm_v4)
   813              XOR_avx2(ymm_v11, ymm_v8)
   814              XOR_avx2(ymm_v15, ymm_v12)
   815  
   816              VMOVDQA(mem_tmp0, ymm_tmp0) # Save
   817  
   818              ROTW8_avx2(ymm_tmp0, ymm_v3)
   819              ROTW8_avx2(ymm_tmp0, ymm_v7)
   820              ROTW8_avx2(ymm_tmp0, ymm_v11)
   821              ROTW8_avx2(ymm_tmp0, ymm_v15)
   822  
   823              # c += d; b ^= c; b = ROTW7(b)
   824              ADD_avx2(ymm_v2, ymm_v3)
   825              ADD_avx2(ymm_v6, ymm_v7)
   826              ADD_avx2(ymm_v10, ymm_v11)
   827              ADD_avx2(ymm_v14, ymm_v15)
   828              XOR_avx2(ymm_v1, ymm_v2)
   829              XOR_avx2(ymm_v5, ymm_v6)
   830              XOR_avx2(ymm_v9, ymm_v10)
   831              XOR_avx2(ymm_v13, ymm_v14)
   832              ROTW7_avx2(ymm_tmp0, ymm_v1)
   833              ROTW7_avx2(ymm_tmp0, ymm_v5)
   834              ROTW7_avx2(ymm_tmp0, ymm_v9)
   835              ROTW7_avx2(ymm_tmp0, ymm_v13)
   836  
   837              # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
   838              VPSHUFD(ymm_v1, ymm_v1, 0x39)
   839              VPSHUFD(ymm_v5, ymm_v5, 0x39)
   840              VPSHUFD(ymm_v9, ymm_v9, 0x39)
   841              VPSHUFD(ymm_v13, ymm_v13, 0x39)
   842              VPSHUFD(ymm_v2, ymm_v2, 0x4e)
   843              VPSHUFD(ymm_v6, ymm_v6, 0x4e)
   844              VPSHUFD(ymm_v10, ymm_v10, 0x4e)
   845              VPSHUFD(ymm_v14, ymm_v14, 0x4e)
   846              VPSHUFD(ymm_v3, ymm_v3, 0x93)
   847              VPSHUFD(ymm_v7, ymm_v7, 0x93)
   848              VPSHUFD(ymm_v11, ymm_v11, 0x93)
   849              VPSHUFD(ymm_v15, ymm_v15, 0x93)
   850  
   851              # a += b; d ^= a; d = ROTW16(d);
   852              VMOVDQA(ymm_tmp0, mem_tmp0) # Restore
   853  
   854              ADD_avx2(ymm_v0, ymm_v1)
   855              ADD_avx2(ymm_v4, ymm_v5)
   856              ADD_avx2(ymm_v8, ymm_v9)
   857              ADD_avx2(ymm_v12, ymm_v13)
   858              XOR_avx2(ymm_v3, ymm_v0)
   859              XOR_avx2(ymm_v7, ymm_v4)
   860              XOR_avx2(ymm_v11, ymm_v8)
   861              XOR_avx2(ymm_v15, ymm_v12)
   862  
   863              VMOVDQA(mem_tmp0, ymm_tmp0) # Save
   864  
   865              ROTW16_avx2(ymm_tmp0, ymm_v3)
   866              ROTW16_avx2(ymm_tmp0, ymm_v7)
   867              ROTW16_avx2(ymm_tmp0, ymm_v11)
   868              ROTW16_avx2(ymm_tmp0, ymm_v15)
   869  
   870              # c += d; b ^= c; b = ROTW12(b);
   871              ADD_avx2(ymm_v2, ymm_v3)
   872              ADD_avx2(ymm_v6, ymm_v7)
   873              ADD_avx2(ymm_v10, ymm_v11)
   874              ADD_avx2(ymm_v14, ymm_v15)
   875              XOR_avx2(ymm_v1, ymm_v2)
   876              XOR_avx2(ymm_v5, ymm_v6)
   877              XOR_avx2(ymm_v9, ymm_v10)
   878              XOR_avx2(ymm_v13, ymm_v14)
   879              ROTW12_avx2(ymm_tmp0, ymm_v1)
   880              ROTW12_avx2(ymm_tmp0, ymm_v5)
   881              ROTW12_avx2(ymm_tmp0, ymm_v9)
   882              ROTW12_avx2(ymm_tmp0, ymm_v13)
   883  
   884              # a += b; d ^= a; d = ROTW8(d);
   885              VMOVDQA(ymm_tmp0, mem_tmp0) # Restore
   886  
   887              ADD_avx2(ymm_v0, ymm_v1)
   888              ADD_avx2(ymm_v4, ymm_v5)
   889              ADD_avx2(ymm_v8, ymm_v9)
   890              ADD_avx2(ymm_v12, ymm_v13)
   891              XOR_avx2(ymm_v3, ymm_v0)
   892              XOR_avx2(ymm_v7, ymm_v4)
   893              XOR_avx2(ymm_v11, ymm_v8)
   894              XOR_avx2(ymm_v15, ymm_v12)
   895  
   896              VMOVDQA(mem_tmp0, ymm_tmp0) # Save
   897  
   898              ROTW8_avx2(ymm_tmp0, ymm_v3)
   899              ROTW8_avx2(ymm_tmp0, ymm_v7)
   900              ROTW8_avx2(ymm_tmp0, ymm_v11)
   901              ROTW8_avx2(ymm_tmp0, ymm_v15)
   902  
   903              # c += d; b ^= c; b = ROTW7(b)
   904              ADD_avx2(ymm_v2, ymm_v3)
   905              ADD_avx2(ymm_v6, ymm_v7)
   906              ADD_avx2(ymm_v10, ymm_v11)
   907              ADD_avx2(ymm_v14, ymm_v15)
   908              XOR_avx2(ymm_v1, ymm_v2)
   909              XOR_avx2(ymm_v5, ymm_v6)
   910              XOR_avx2(ymm_v9, ymm_v10)
   911              XOR_avx2(ymm_v13, ymm_v14)
   912              ROTW7_avx2(ymm_tmp0, ymm_v1)
   913              ROTW7_avx2(ymm_tmp0, ymm_v5)
   914              ROTW7_avx2(ymm_tmp0, ymm_v9)
   915              ROTW7_avx2(ymm_tmp0, ymm_v13)
   916  
   917              # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
   918              VPSHUFD(ymm_v1, ymm_v1, 0x93)
   919              VPSHUFD(ymm_v5, ymm_v5, 0x93)
   920              VPSHUFD(ymm_v9, ymm_v9, 0x93)
   921              VPSHUFD(ymm_v13, ymm_v13, 0x93)
   922              VPSHUFD(ymm_v2, ymm_v2, 0x4e)
   923              VPSHUFD(ymm_v6, ymm_v6, 0x4e)
   924              VPSHUFD(ymm_v10, ymm_v10, 0x4e)
   925              VPSHUFD(ymm_v14, ymm_v14, 0x4e)
   926              VPSHUFD(ymm_v3, ymm_v3, 0x39)
   927              VPSHUFD(ymm_v7, ymm_v7, 0x39)
   928              VPSHUFD(ymm_v11, ymm_v11, 0x39)
   929              VPSHUFD(ymm_v15, ymm_v15, 0x39)
   930  
   931              VMOVDQA(ymm_tmp0, mem_tmp0) # Restore
   932  
   933              SUB(reg_rounds, 2)
   934              JNZ(rounds_loop8.begin)
   935  
   936          # ymm_v12 is in mem_tmp0 and is current....
   937  
   938          # XXX: I assume VBROADCASTI128 is about as fast as VMOVDQA....
   939          VBROADCASTI128(ymm_tmp0, x_s0)
   940          ADD_avx2(ymm_v0, ymm_tmp0)
   941          ADD_avx2(ymm_v4, ymm_tmp0)
   942          ADD_avx2(ymm_v8, ymm_tmp0)
   943          ADD_avx2(ymm_tmp0, mem_tmp0)
   944          VMOVDQA(mem_tmp0, ymm_tmp0)
   945  
   946          VBROADCASTI128(ymm_tmp0, x_s1)
   947          ADD_avx2(ymm_v1, ymm_tmp0)
   948          ADD_avx2(ymm_v5, ymm_tmp0)
   949          ADD_avx2(ymm_v9, ymm_tmp0)
   950          ADD_avx2(ymm_v13, ymm_tmp0)
   951  
   952          VBROADCASTI128(ymm_tmp0, x_s2)
   953          ADD_avx2(ymm_v2, ymm_tmp0)
   954          ADD_avx2(ymm_v6, ymm_tmp0)
   955          ADD_avx2(ymm_v10, ymm_tmp0)
   956          ADD_avx2(ymm_v14, ymm_tmp0)
   957  
   958          ADD_avx2(ymm_v3, mem_s3)
   959          WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 0, ymm_v0, ymm_v1, ymm_v2, ymm_v3)
   960          VMOVDQA(ymm_v3, mem_s3)
   961          ADD_avx2(ymm_v3, mem_inc)
   962  
   963          ADD_avx2(ymm_v7, ymm_v3)
   964          WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 128, ymm_v4, ymm_v5, ymm_v6, ymm_v7)
   965          ADD_avx2(ymm_v3, mem_inc)
   966  
   967          ADD_avx2(ymm_v11, ymm_v3)
   968          WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 256, ymm_v8, ymm_v9, ymm_v10, ymm_v11)
   969          ADD_avx2(ymm_v3, mem_inc)
   970  
   971          VMOVDQA(ymm_v12, mem_tmp0)
   972          ADD_avx2(ymm_v15, ymm_v3)
   973          WriteXor_avx2(ymm_v0, reg_inp, reg_outp, 384, ymm_v12, ymm_v13, ymm_v14, ymm_v15)
   974          ADD_avx2(ymm_v3, mem_inc)
   975  
   976          VMOVDQA(mem_s3, ymm_v3)
   977  
   978          ADD(reg_inp, 8 * 64)
   979          ADD(reg_outp, 8 * 64)
   980  
   981          SUB(reg_blocks, 8)
   982          JAE(vector_loop8.begin)
   983  
   984      # ymm_v3 contains a current copy of mem_s3 either from when it was built,
   985      # or because the loop updates it.  Copy this before we mess with the block
   986      # counter in case we need to write it back and return.
   987      ymm_s3 = ymm_v11
   988      VMOVDQA(ymm_s3, ymm_v3)
   989  
   990      ADD(reg_blocks, 8)
   991      JZ(out_write_even)
   992  
   993      # We now actually can do everything in registers.
   994      ymm_s0 = ymm_v8
   995      VBROADCASTI128(ymm_s0, x_s0)
   996      ymm_s1 = ymm_v9
   997      VBROADCASTI128(ymm_s1, x_s1)
   998      ymm_s2 = ymm_v10
   999      VBROADCASTI128(ymm_s2, x_s2)
  1000      ymm_inc = ymm_v14
  1001      VMOVDQA(ymm_inc, mem_inc)
  1002  
  1003      #
  1004      # 4 blocks at a time.
  1005      #
  1006  
  1007      process_2_blocks = Label()
  1008      SUB(reg_blocks, 4)
  1009      JB(process_2_blocks) # < 4 blocks remaining.
  1010  
  1011      VMOVDQA(ymm_v0, ymm_s0)
  1012      VMOVDQA(ymm_v1, ymm_s1)
  1013      VMOVDQA(ymm_v2, ymm_s2)
  1014      VMOVDQA(ymm_v3, ymm_s3)
  1015  
  1016      VMOVDQA(ymm_v4, ymm_v0)
  1017      VMOVDQA(ymm_v5, ymm_v1)
  1018      VMOVDQA(ymm_v6, ymm_v2)
  1019      VPADDQ(ymm_v7, ymm_v3, ymm_inc)
  1020  
  1021      MOV(reg_rounds, 20)
  1022      rounds_loop4 = Loop()
  1023      with rounds_loop4:
  1024          # a += b; d ^= a; d = ROTW16(d);
  1025          ADD_avx2(ymm_v0, ymm_v1)
  1026          ADD_avx2(ymm_v4, ymm_v5)
  1027          XOR_avx2(ymm_v3, ymm_v0)
  1028          XOR_avx2(ymm_v7, ymm_v4)
  1029          ROTW16_avx2(ymm_tmp0, ymm_v3)
  1030          ROTW16_avx2(ymm_tmp0, ymm_v7)
  1031  
  1032          # c += d; b ^= c; b = ROTW12(b);
  1033          ADD_avx2(ymm_v2, ymm_v3)
  1034          ADD_avx2(ymm_v6, ymm_v7)
  1035          XOR_avx2(ymm_v1, ymm_v2)
  1036          XOR_avx2(ymm_v5, ymm_v6)
  1037          ROTW12_avx2(ymm_tmp0, ymm_v1)
  1038          ROTW12_avx2(ymm_tmp0, ymm_v5)
  1039  
  1040          # a += b; d ^= a; d = ROTW8(d);
  1041          ADD_avx2(ymm_v0, ymm_v1)
  1042          ADD_avx2(ymm_v4, ymm_v5)
  1043          XOR_avx2(ymm_v3, ymm_v0)
  1044          XOR_avx2(ymm_v7, ymm_v4)
  1045          ROTW8_avx2(ymm_tmp0, ymm_v3)
  1046          ROTW8_avx2(ymm_tmp0, ymm_v7)
  1047  
  1048          # c += d; b ^= c; b = ROTW7(b)
  1049          ADD_avx2(ymm_v2, ymm_v3)
  1050          ADD_avx2(ymm_v6, ymm_v7)
  1051          XOR_avx2(ymm_v1, ymm_v2)
  1052          XOR_avx2(ymm_v5, ymm_v6)
  1053          ROTW7_avx2(ymm_tmp0, ymm_v1)
  1054          ROTW7_avx2(ymm_tmp0, ymm_v5)
  1055  
  1056          # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
  1057          VPSHUFD(ymm_v1, ymm_v1, 0x39)
  1058          VPSHUFD(ymm_v5, ymm_v5, 0x39)
  1059          VPSHUFD(ymm_v2, ymm_v2, 0x4e)
  1060          VPSHUFD(ymm_v6, ymm_v6, 0x4e)
  1061          VPSHUFD(ymm_v3, ymm_v3, 0x93)
  1062          VPSHUFD(ymm_v7, ymm_v7, 0x93)
  1063  
  1064          # a += b; d ^= a; d = ROTW16(d);
  1065          ADD_avx2(ymm_v0, ymm_v1)
  1066          ADD_avx2(ymm_v4, ymm_v5)
  1067          XOR_avx2(ymm_v3, ymm_v0)
  1068          XOR_avx2(ymm_v7, ymm_v4)
  1069          ROTW16_avx2(ymm_tmp0, ymm_v3)
  1070          ROTW16_avx2(ymm_tmp0, ymm_v7)
  1071  
  1072          # c += d; b ^= c; b = ROTW12(b);
  1073          ADD_avx2(ymm_v2, ymm_v3)
  1074          ADD_avx2(ymm_v6, ymm_v7)
  1075          XOR_avx2(ymm_v1, ymm_v2)
  1076          XOR_avx2(ymm_v5, ymm_v6)
  1077          ROTW12_avx2(ymm_tmp0, ymm_v1)
  1078          ROTW12_avx2(ymm_tmp0, ymm_v5)
  1079  
  1080          # a += b; d ^= a; d = ROTW8(d);
  1081          ADD_avx2(ymm_v0, ymm_v1)
  1082          ADD_avx2(ymm_v4, ymm_v5)
  1083          XOR_avx2(ymm_v3, ymm_v0)
  1084          XOR_avx2(ymm_v7, ymm_v4)
  1085          ROTW8_avx2(ymm_tmp0, ymm_v3)
  1086          ROTW8_avx2(ymm_tmp0, ymm_v7)
  1087  
  1088          # c += d; b ^= c; b = ROTW7(b)
  1089          ADD_avx2(ymm_v2, ymm_v3)
  1090          ADD_avx2(ymm_v6, ymm_v7)
  1091          XOR_avx2(ymm_v1, ymm_v2)
  1092          XOR_avx2(ymm_v5, ymm_v6)
  1093          ROTW7_avx2(ymm_tmp0, ymm_v1)
  1094          ROTW7_avx2(ymm_tmp0, ymm_v5)
  1095  
  1096          # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
  1097          VPSHUFD(ymm_v1, ymm_v1, 0x93)
  1098          VPSHUFD(ymm_v5, ymm_v5, 0x93)
  1099          VPSHUFD(ymm_v2, ymm_v2, 0x4e)
  1100          VPSHUFD(ymm_v6, ymm_v6, 0x4e)
  1101          VPSHUFD(ymm_v3, ymm_v3, 0x39)
  1102          VPSHUFD(ymm_v7, ymm_v7, 0x39)
  1103  
  1104          SUB(reg_rounds, 2)
  1105          JNZ(rounds_loop4.begin)
  1106  
  1107      ADD_avx2(ymm_v0, ymm_s0)
  1108      ADD_avx2(ymm_v1, ymm_s1)
  1109      ADD_avx2(ymm_v2, ymm_s2)
  1110      ADD_avx2(ymm_v3, ymm_s3)
  1111      WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 0, ymm_v0, ymm_v1, ymm_v2, ymm_v3)
  1112      ADD_avx2(ymm_s3, ymm_inc)
  1113  
  1114      ADD_avx2(ymm_v4, ymm_s0)
  1115      ADD_avx2(ymm_v5, ymm_s1)
  1116      ADD_avx2(ymm_v6, ymm_s2)
  1117      ADD_avx2(ymm_v7, ymm_s3)
  1118      WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 128, ymm_v4, ymm_v5, ymm_v6, ymm_v7)
  1119      ADD_avx2(ymm_s3, ymm_inc)
  1120  
  1121      ADD(reg_inp, 4 * 64)
  1122      ADD(reg_outp, 4 * 64)
  1123      SUB(reg_blocks, 4)
  1124  
  1125      LABEL(process_2_blocks)
  1126      ADD(reg_blocks, 4)
  1127      JZ(out_write_even) # 0 blocks left.
  1128  
  1129      #
  1130      # 2/1 blocks at a time.  The two codepaths are unified because
  1131      # with AVX2 we do 2 blocks at a time anyway, and this only gets called
  1132      # if 3/2/1 blocks are remaining, so the extra branches don't hurt that
  1133      # much.
  1134      #
  1135  
  1136      vector_loop2 = Loop()
  1137      with vector_loop2:
  1138          VMOVDQA(ymm_v0, ymm_s0)
  1139          VMOVDQA(ymm_v1, ymm_s1)
  1140          VMOVDQA(ymm_v2, ymm_s2)
  1141          VMOVDQA(ymm_v3, ymm_s3)
  1142  
  1143          MOV(reg_rounds, 20)
  1144          rounds_loop2 = Loop()
  1145          with rounds_loop2:
  1146              # a += b; d ^= a; d = ROTW16(d);
  1147              ADD_avx2(ymm_v0, ymm_v1)
  1148              XOR_avx2(ymm_v3, ymm_v0)
  1149              ROTW16_avx2(ymm_tmp0, ymm_v3)
  1150  
  1151              # c += d; b ^= c; b = ROTW12(b);
  1152              ADD_avx2(ymm_v2, ymm_v3)
  1153              XOR_avx2(ymm_v1, ymm_v2)
  1154              ROTW12_avx2(ymm_tmp0, ymm_v1)
  1155  
  1156              # a += b; d ^= a; d = ROTW8(d);
  1157              ADD_avx2(ymm_v0, ymm_v1)
  1158              XOR_avx2(ymm_v3, ymm_v0)
  1159              ROTW8_avx2(ymm_tmp0, ymm_v3)
  1160  
  1161              # c += d; b ^= c; b = ROTW7(b)
  1162              ADD_avx2(ymm_v2, ymm_v3)
  1163              XOR_avx2(ymm_v1, ymm_v2)
  1164              ROTW7_avx2(ymm_tmp0, ymm_v1)
  1165  
  1166              # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
  1167              VPSHUFD(ymm_v1, ymm_v1, 0x39)
  1168              VPSHUFD(ymm_v2, ymm_v2, 0x4e)
  1169              VPSHUFD(ymm_v3, ymm_v3, 0x93)
  1170  
  1171              # a += b; d ^= a; d = ROTW16(d);
  1172              ADD_avx2(ymm_v0, ymm_v1)
  1173              XOR_avx2(ymm_v3, ymm_v0)
  1174              ROTW16_avx2(ymm_tmp0, ymm_v3)
  1175  
  1176              # c += d; b ^= c; b = ROTW12(b);
  1177              ADD_avx2(ymm_v2, ymm_v3)
  1178              XOR_avx2(ymm_v1, ymm_v2)
  1179              ROTW12_avx2(ymm_tmp0, ymm_v1)
  1180  
  1181              # a += b; d ^= a; d = ROTW8(d);
  1182              ADD_avx2(ymm_v0, ymm_v1)
  1183              XOR_avx2(ymm_v3, ymm_v0)
  1184              ROTW8_avx2(ymm_tmp0, ymm_v3)
  1185  
  1186              # c += d; b ^= c; b = ROTW7(b)
  1187              ADD_avx2(ymm_v2, ymm_v3)
  1188              XOR_avx2(ymm_v1, ymm_v2)
  1189              ROTW7_avx2(ymm_tmp0, ymm_v1)
  1190  
  1191              # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
  1192              VPSHUFD(ymm_v1, ymm_v1, 0x93)
  1193              VPSHUFD(ymm_v2, ymm_v2, 0x4e)
  1194              VPSHUFD(ymm_v3, ymm_v3, 0x39)
  1195  
  1196              SUB(reg_rounds, 2)
  1197              JNZ(rounds_loop2.begin)
  1198  
  1199          ADD_avx2(ymm_v0, ymm_s0)
  1200          ADD_avx2(ymm_v1, ymm_s1)
  1201          ADD_avx2(ymm_v2, ymm_s2)
  1202          ADD_avx2(ymm_v3, ymm_s3)
  1203  
  1204          # XOR_WRITE(out+ 0, in+ 0, _mm256_permute2x128_si256(v0,v1,0x20));
  1205          VPERM2I128(ymm_tmp0, ymm_v0, ymm_v1, 0x20)
  1206          VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp])
  1207          VMOVDQU([reg_outp], ymm_tmp0)
  1208  
  1209          # XOR_WRITE(out+32, in+32, _mm256_permute2x128_si256(v2,v3,0x20));
  1210          VPERM2I128(ymm_tmp0, ymm_v2, ymm_v3, 0x20)
  1211          VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp+32])
  1212          VMOVDQU([reg_outp+32], ymm_tmp0)
  1213  
  1214          SUB(reg_blocks, 1)
  1215          JZ(out_write_odd)
  1216  
  1217          ADD_avx2(ymm_s3, ymm_inc)
  1218  
  1219          # XOR_WRITE(out+64, in+64, _mm256_permute2x128_si256(v0,v1,0x31));
  1220          VPERM2I128(ymm_tmp0, ymm_v0, ymm_v1, 0x31)
  1221          VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp+64])
  1222          VMOVDQU([reg_outp+64], ymm_tmp0)
  1223  
  1224          # XOR_WRITE(out+96, in+96, _mm256_permute2x128_si256(v2,v3,0x31));
  1225          VPERM2I128(ymm_tmp0, ymm_v2, ymm_v3, 0x31)
  1226          VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp+96])
  1227          VMOVDQU([reg_outp+96], ymm_tmp0)
  1228  
  1229          SUB(reg_blocks, 1)
  1230          JZ(out_write_even)
  1231  
  1232          ADD(reg_inp, 2 * 64)
  1233          ADD(reg_outp, 2 * 64)
  1234          JMP(vector_loop2.begin)
  1235  
  1236      LABEL(out_write_odd)
  1237      VPERM2I128(ymm_s3, ymm_s3, ymm_s3, 0x01) # Odd number of blocks.
  1238  
  1239      LABEL(out_write_even)
  1240      VMOVDQU(x_s3, ymm_s3.as_xmm) # Write back ymm_s3 to x_s3
  1241  
  1242      # Paranoia, cleanse the scratch space.
  1243      VPXOR(ymm_v0, ymm_v0, ymm_v0)
  1244      VMOVDQA(mem_tmp0, ymm_v0)
  1245      VMOVDQA(mem_s3, ymm_v0)
  1246  
  1247      # Clear all YMM (and XMM) registers.
  1248      VZEROALL()
  1249  
  1250      # Remove our stack allocation.
  1251      MOV(registers.rsp, reg_sp_save)
  1252  
  1253      RETURN()
  1254  
  1255  #
  1256  # CPUID
  1257  #
  1258  
  1259  cpuidParams = Argument(ptr(uint32_t))
  1260  
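        # cpuidParams points at four uint32 words laid out as {EAX, EBX, ECX, EDX}:
        # the leaf goes in word 0 and the sub-leaf in word 2 on entry, and all four
        # words receive the corresponding CPUID output registers on return.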
  1261  with Function("cpuidAmd64", (cpuidParams,)):
  1262      reg_params = registers.r15
  1263      LOAD.ARGUMENT(reg_params, cpuidParams)
  1264  
  1265      MOV(registers.eax, [reg_params])
  1266      MOV(registers.ecx, [reg_params+8])
  1267  
  1268      CPUID()
  1269  
  1270      MOV([reg_params], registers.eax)
  1271      MOV([reg_params+4], registers.ebx)
  1272      MOV([reg_params+8], registers.ecx)
  1273      MOV([reg_params+12], registers.edx)
  1274  
  1275      RETURN()
  1276  
  1277  #
  1278  # XGETBV (ECX = 0)
  1279  #
  1280  
  1281  xcrVec = Argument(ptr(uint32_t))
  1282  
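        # xcrVec receives XCR0 as two uint32 words (EAX, then EDX).  The Go caller
        # presumably checks bits 1 and 2 (SSE/AVX state enabled by the OS) before
        # picking the AVX2 code path.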
  1283  with Function("xgetbv0Amd64", (xcrVec,)):
  1284      reg_vec = GeneralPurposeRegister64()
  1285  
  1286      LOAD.ARGUMENT(reg_vec, xcrVec)
  1287  
  1288      XOR(registers.ecx, registers.ecx)
  1289  
  1290      XGETBV()
  1291  
  1292      MOV([reg_vec], registers.eax)
  1293      MOV([reg_vec+4], registers.edx)
  1294  
  1295      RETURN()