git.sr.ht/~pingoo/stdx@v0.0.0-20240218134121-094174641f6e/crypto/asm/chacha_x86_64.pl

     1  #! /usr/bin/env perl
     2  # Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
     3  #
     4  # Licensed under the OpenSSL license (the "License").  You may not use
     5  # this file except in compliance with the License.  You can obtain a copy
     6  # in the file LICENSE in the source distribution or at
     7  # https://www.openssl.org/source/license.html
     8  
     9  #
    10  # ====================================================================
    11  # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
    12  # project. The module is, however, dual licensed under OpenSSL and
    13  # CRYPTOGAMS licenses depending on where you obtain it. For further
    14  # details see http://www.openssl.org/~appro/cryptogams/.
    15  # ====================================================================
    16  #
    17  # November 2014
    18  #
    19  # ChaCha20 for x86_64.
    20  #
    21  # December 2016
    22  #
    23  # Add AVX512F code path.
    24  #
    25  # Performance in cycles per byte out of large buffer.
    26  #
    27  #		IALU/gcc 4.8(i)	1xSSSE3/SSE2	4xSSSE3	    NxAVX(v)
    28  #
    29  # P4		9.48/+99%	-/22.7(ii)	-
    30  # Core2		7.83/+55%	7.90/8.08	4.35
    31  # Westmere	7.19/+50%	5.60/6.70	3.00
    32  # Sandy Bridge	8.31/+42%	5.45/6.76	2.72
    33  # Ivy Bridge	6.71/+46%	5.40/6.49	2.41
    34  # Haswell	5.92/+43%	5.20/6.45	2.42	    1.23
    35  # Skylake[-X]	5.87/+39%	4.70/-		2.31	    1.19[0.57]
    36  # Silvermont	12.0/+33%	7.75/7.40	7.03(iii)
    37  # Knights L	11.7/-		-		9.60(iii)   0.80
    38  # Goldmont	10.6/+17%	5.10/-		3.28
    39  # Sledgehammer	7.28/+52%	-/14.2(ii)	-
    40  # Bulldozer	9.66/+28%	9.85/11.1	3.06(iv)
    41  # Ryzen		5.96/+50%	5.19/-		2.40        2.09
    42  # VIA Nano	10.5/+46%	6.72/8.60	6.05
    43  #
    44  # (i)	compared to older gcc 3.x one can observe >2x improvement on
    45  #	most platforms;
    46  # (ii)	as can be seen, SSE2 performance is too low on legacy
    47  #	processors; NxSSE2 results are naturally better, but not
    48  #	impressively better than IALU ones, which is why you won't
    49  #	find SSE2 code below;
    50  # (iii)	this is not an optimal result for Atom because of MSROM
    51  #	limitations; SSE2 can do better, but the gain is considered
    52  #	too low to justify the [maintenance] effort;
    53  # (iv)	Bulldozer actually executes 4xXOP code path that delivers 2.20;
    54  #
    55  # Modified from upstream OpenSSL to remove the XOP code.
    56  
    57  $flavour = shift;
    58  $output  = shift;
    59  if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
    60  
    61  $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
    62  
    63  $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    64  ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
    65  ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
    66  die "can't locate x86_64-xlate.pl";
    67  
    68  $avx = 2;
    69  
    70  open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
    71  *STDOUT=*OUT;
    72  
    73  # input parameter block
    74  ($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8");
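        # Reading the loads below: $out and $inp are the destination and
        # source byte pointers, $len is the byte count, $key points at the
        # 32-byte key (fetched as ($key) and 16($key)), and $counter points
        # at a 16-byte block whose first dword is the 32-bit block counter
        # and whose remaining 12 bytes are the nonce -- hence the _ctr32
        # suffix on the entry point.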
    75  
    76  $code.=<<___;
    77  .text
    78  
    79  .extern OPENSSL_ia32cap_P
    80  
    81  .section .rodata
    82  .align	64
    83  .Lzero:
    84  .long	0,0,0,0
    85  .Lone:
    86  .long	1,0,0,0
    87  .Linc:
    88  .long	0,1,2,3
    89  .Lfour:
    90  .long	4,4,4,4
    91  .Lincy:
    92  .long	0,2,4,6,1,3,5,7
    93  .Leight:
    94  .long	8,8,8,8,8,8,8,8
    95  .Lrot16:
    96  .byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
    97  .Lrot24:
    98  .byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
    99  .Lsigma:
   100  .asciz	"expand 32-byte k"
   101  .align	64
   102  .Lzeroz:
   103  .long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
   104  .Lfourz:
   105  .long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
   106  .Lincz:
   107  .long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
   108  .Lsixteen:
   109  .long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
   110  .asciz	"ChaCha20 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
   111  .text
   112  ___
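        # The literal dwords 0x61707865, 0x3320646e, 0x79622d32 and 0x6b206574
        # used by the scalar code below are simply .Lsigma ("expand 32-byte k")
        # read as four little-endian 32-bit words, e.g. in Perl:
        #   unpack("V4","expand 32-byte k") == (0x61707865,0x3320646e,0x79622d32,0x6b206574)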
   113  
   114  sub AUTOLOAD()          # thunk [simplified] 32-bit style perlasm
   115  { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
   116    my $arg = pop;
   117      $arg = "\$$arg" if ($arg*1 eq $arg);
   118      $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
   119  }
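        # For example, &add(@x[0],@x[4]) appends "add %r8d,%eax" to $code and
        # &rol(@x[12],16) appends "rol $16,%r12d": the last Perl argument is
        # emitted first to match AT&T operand order, and a purely numeric
        # argument is turned into an immediate.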
   120  
   121  @x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)),
   122      "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15)));
   123  @t=("%esi","%edi");
   124  
   125  sub ROUND {			# critical path is 24 cycles per round
   126  my ($a0,$b0,$c0,$d0)=@_;
   127  my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
   128  my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
   129  my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
   130  my ($xc,$xc_)=map("\"$_\"",@t);
   131  my @x=map("\"$_\"",@x);
   132  
   133  	# Consider order in which variables are addressed by their
   134  	# index:
   135  	#
   136  	#	a   b   c   d
   137  	#
   138  	#	0   4   8  12 < even round
   139  	#	1   5   9  13
   140  	#	2   6  10  14
   141  	#	3   7  11  15
   142  	#	0   5  10  15 < odd round
   143  	#	1   6  11  12
   144  	#	2   7   8  13
   145  	#	3   4   9  14
   146  	#
   147  	# 'a', 'b' and 'd's are permanently allocated in registers,
   148  	# @x[0..7,12..15], while 'c's are maintained in memory. If
   149  	# you observe the 'c' column, you'll notice that a pair of 'c's
   150  	# is invariant between rounds. This means that we have to reload
   151  	# them once per round, in the middle. This is why you'll see a
   152  	# bunch of 'c' stores and loads in the middle, but none at
   153  	# the beginning or end.
   154  
   155  	# Normally instructions would be interleaved to favour in-order
   156  	# execution. Out-of-order cores generally manage it gracefully,
   157  	# but not this time, for some reason. As in-order execution
   158  	# cores are a dying breed and old Atom is the only one still
   159  	# around, the instructions are left uninterleaved. Besides, Atom
   160  	# is better off executing the 1xSSSE3 code anyway...
   161  
   162  	(
   163  	"&add	(@x[$a0],@x[$b0])",	# Q1
   164  	"&xor	(@x[$d0],@x[$a0])",
   165  	"&rol	(@x[$d0],16)",
   166  	 "&add	(@x[$a1],@x[$b1])",	# Q2
   167  	 "&xor	(@x[$d1],@x[$a1])",
   168  	 "&rol	(@x[$d1],16)",
   169  
   170  	"&add	($xc,@x[$d0])",
   171  	"&xor	(@x[$b0],$xc)",
   172  	"&rol	(@x[$b0],12)",
   173  	 "&add	($xc_,@x[$d1])",
   174  	 "&xor	(@x[$b1],$xc_)",
   175  	 "&rol	(@x[$b1],12)",
   176  
   177  	"&add	(@x[$a0],@x[$b0])",
   178  	"&xor	(@x[$d0],@x[$a0])",
   179  	"&rol	(@x[$d0],8)",
   180  	 "&add	(@x[$a1],@x[$b1])",
   181  	 "&xor	(@x[$d1],@x[$a1])",
   182  	 "&rol	(@x[$d1],8)",
   183  
   184  	"&add	($xc,@x[$d0])",
   185  	"&xor	(@x[$b0],$xc)",
   186  	"&rol	(@x[$b0],7)",
   187  	 "&add	($xc_,@x[$d1])",
   188  	 "&xor	(@x[$b1],$xc_)",
   189  	 "&rol	(@x[$b1],7)",
   190  
   191  	"&mov	(\"4*$c0(%rsp)\",$xc)",	# reload pair of 'c's
   192  	 "&mov	(\"4*$c1(%rsp)\",$xc_)",
   193  	"&mov	($xc,\"4*$c2(%rsp)\")",
   194  	 "&mov	($xc_,\"4*$c3(%rsp)\")",
   195  
   196  	"&add	(@x[$a2],@x[$b2])",	# Q3
   197  	"&xor	(@x[$d2],@x[$a2])",
   198  	"&rol	(@x[$d2],16)",
   199  	 "&add	(@x[$a3],@x[$b3])",	# Q4
   200  	 "&xor	(@x[$d3],@x[$a3])",
   201  	 "&rol	(@x[$d3],16)",
   202  
   203  	"&add	($xc,@x[$d2])",
   204  	"&xor	(@x[$b2],$xc)",
   205  	"&rol	(@x[$b2],12)",
   206  	 "&add	($xc_,@x[$d3])",
   207  	 "&xor	(@x[$b3],$xc_)",
   208  	 "&rol	(@x[$b3],12)",
   209  
   210  	"&add	(@x[$a2],@x[$b2])",
   211  	"&xor	(@x[$d2],@x[$a2])",
   212  	"&rol	(@x[$d2],8)",
   213  	 "&add	(@x[$a3],@x[$b3])",
   214  	 "&xor	(@x[$d3],@x[$a3])",
   215  	 "&rol	(@x[$d3],8)",
   216  
   217  	"&add	($xc,@x[$d2])",
   218  	"&xor	(@x[$b2],$xc)",
   219  	"&rol	(@x[$b2],7)",
   220  	 "&add	($xc_,@x[$d3])",
   221  	 "&xor	(@x[$b3],$xc_)",
   222  	 "&rol	(@x[$b3],7)"
   223  	);
   224  }
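
        # For reference only: a plain-Perl model of the quarter-round that the
        # interleaved code emitted by ROUND() implements (add/xor/rotate by
        # 16, 12, 8 and 7, as in RFC 7539).  It is never called by this
        # generator and assumes a perl with 64-bit integers; it is merely a
        # sketch for cross-checking the scheduling above.
        sub quarter_round_ref {
        my ($st,$a,$b,$c,$d)=@_;	# $st is a reference to the 16-dword state
        my $M=0xffffffff;
        my $rotl=sub { my ($v,$n)=@_; (($v<<$n)|($v>>(32-$n)))&$M; };

        	$st->[$a]=($st->[$a]+$st->[$b])&$M; $st->[$d]=$rotl->($st->[$d]^$st->[$a],16);
        	$st->[$c]=($st->[$c]+$st->[$d])&$M; $st->[$b]=$rotl->($st->[$b]^$st->[$c],12);
        	$st->[$a]=($st->[$a]+$st->[$b])&$M; $st->[$d]=$rotl->($st->[$d]^$st->[$a],8);
        	$st->[$c]=($st->[$c]+$st->[$d])&$M; $st->[$b]=$rotl->($st->[$b]^$st->[$c],7);
        }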
   225  
   226  ########################################################################
   227  # Generic code path that handles all lengths on pre-SSSE3 processors.
   228  $code.=<<___;
   229  .globl	ChaCha20_ctr32
   230  .type	ChaCha20_ctr32,\@function,5
   231  .align	64
   232  ChaCha20_ctr32:
   233  .cfi_startproc
   234  	_CET_ENDBR
   235  	cmp	\$0,$len
   236  	je	.Lno_data
   237  	mov	OPENSSL_ia32cap_P+4(%rip),%r10	# capability words 1..2
   238  ___
   239  $code.=<<___;
   240  	test	\$`1<<(41-32)`,%r10d		# SSSE3 bit
   241  	jnz	.LChaCha20_ssse3
   242  
   243  	push	%rbx
   244  .cfi_push	rbx
   245  	push	%rbp
   246  .cfi_push	rbp
   247  	push	%r12
   248  .cfi_push	r12
   249  	push	%r13
   250  .cfi_push	r13
   251  	push	%r14
   252  .cfi_push	r14
   253  	push	%r15
   254  .cfi_push	r15
   255  	sub	\$64+24,%rsp
   256  .cfi_adjust_cfa_offset	`64+24`
   257  .Lctr32_body:
   258  
   259  	#movdqa	.Lsigma(%rip),%xmm0
   260  	movdqu	($key),%xmm1
   261  	movdqu	16($key),%xmm2
   262  	movdqu	($counter),%xmm3
   263  	movdqa	.Lone(%rip),%xmm4
   264  
   265  	#movdqa	%xmm0,4*0(%rsp)		# key[0]
   266  	movdqa	%xmm1,4*4(%rsp)		# key[1]
   267  	movdqa	%xmm2,4*8(%rsp)		# key[2]
   268  	movdqa	%xmm3,4*12(%rsp)	# key[3]
   269  	mov	$len,%rbp		# reassign $len
   270  	jmp	.Loop_outer
   271  
   272  .align	32
   273  .Loop_outer:
   274  	mov	\$0x61707865,@x[0]      # 'expa'
   275  	mov	\$0x3320646e,@x[1]      # 'nd 3'
   276  	mov	\$0x79622d32,@x[2]      # '2-by'
   277  	mov	\$0x6b206574,@x[3]      # 'te k'
   278  	mov	4*4(%rsp),@x[4]
   279  	mov	4*5(%rsp),@x[5]
   280  	mov	4*6(%rsp),@x[6]
   281  	mov	4*7(%rsp),@x[7]
   282  	movd	%xmm3,@x[12]
   283  	mov	4*13(%rsp),@x[13]
   284  	mov	4*14(%rsp),@x[14]
   285  	mov	4*15(%rsp),@x[15]
   286  
   287  	mov	%rbp,64+0(%rsp)		# save len
   288  	mov	\$10,%ebp
   289  	mov	$inp,64+8(%rsp)		# save inp
   290  	movq	%xmm2,%rsi		# "@x[8]"
   291  	mov	$out,64+16(%rsp)	# save out
   292  	mov	%rsi,%rdi
   293  	shr	\$32,%rdi		# "@x[9]"
   294  	jmp	.Loop
   295  
   296  .align	32
   297  .Loop:
   298  ___
   299  	foreach (&ROUND (0, 4, 8,12)) { eval; }
   300  	foreach (&ROUND	(0, 5,10,15)) { eval; }
   301  	&dec	("%ebp");
   302  	&jnz	(".Loop");
   303  
   304  $code.=<<___;
   305  	mov	@t[1],4*9(%rsp)		# modulo-scheduled
   306  	mov	@t[0],4*8(%rsp)
   307  	mov	64(%rsp),%rbp		# load len
   308  	movdqa	%xmm2,%xmm1
   309  	mov	64+8(%rsp),$inp		# load inp
   310  	paddd	%xmm4,%xmm3		# increment counter
   311  	mov	64+16(%rsp),$out	# load out
   312  
   313  	add	\$0x61707865,@x[0]      # 'expa'
   314  	add	\$0x3320646e,@x[1]      # 'nd 3'
   315  	add	\$0x79622d32,@x[2]      # '2-by'
   316  	add	\$0x6b206574,@x[3]      # 'te k'
   317  	add	4*4(%rsp),@x[4]
   318  	add	4*5(%rsp),@x[5]
   319  	add	4*6(%rsp),@x[6]
   320  	add	4*7(%rsp),@x[7]
   321  	add	4*12(%rsp),@x[12]
   322  	add	4*13(%rsp),@x[13]
   323  	add	4*14(%rsp),@x[14]
   324  	add	4*15(%rsp),@x[15]
   325  	paddd	4*8(%rsp),%xmm1
   326  
   327  	cmp	\$64,%rbp
   328  	jb	.Ltail
   329  
   330  	xor	4*0($inp),@x[0]		# xor with input
   331  	xor	4*1($inp),@x[1]
   332  	xor	4*2($inp),@x[2]
   333  	xor	4*3($inp),@x[3]
   334  	xor	4*4($inp),@x[4]
   335  	xor	4*5($inp),@x[5]
   336  	xor	4*6($inp),@x[6]
   337  	xor	4*7($inp),@x[7]
   338  	movdqu	4*8($inp),%xmm0
   339  	xor	4*12($inp),@x[12]
   340  	xor	4*13($inp),@x[13]
   341  	xor	4*14($inp),@x[14]
   342  	xor	4*15($inp),@x[15]
   343  	lea	4*16($inp),$inp		# inp+=64
   344  	pxor	%xmm1,%xmm0
   345  
   346  	movdqa	%xmm2,4*8(%rsp)
   347  	movd	%xmm3,4*12(%rsp)
   348  
   349  	mov	@x[0],4*0($out)		# write output
   350  	mov	@x[1],4*1($out)
   351  	mov	@x[2],4*2($out)
   352  	mov	@x[3],4*3($out)
   353  	mov	@x[4],4*4($out)
   354  	mov	@x[5],4*5($out)
   355  	mov	@x[6],4*6($out)
   356  	mov	@x[7],4*7($out)
   357  	movdqu	%xmm0,4*8($out)
   358  	mov	@x[12],4*12($out)
   359  	mov	@x[13],4*13($out)
   360  	mov	@x[14],4*14($out)
   361  	mov	@x[15],4*15($out)
   362  	lea	4*16($out),$out		# out+=64
   363  
   364  	sub	\$64,%rbp
   365  	jnz	.Loop_outer
   366  
   367  	jmp	.Ldone
   368  
   369  .align	16
   370  .Ltail:
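        	# Fewer than 64 bytes remain.  The freshly generated keystream
        	# block is still in the scalar registers (words 0-7 and 12-15)
        	# and %xmm1 (words 8-11), so spill it to the scratch area at
        	# (%rsp) and XOR it into the output one byte at a time in
        	# .Loop_tail below.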
   371  	mov	@x[0],4*0(%rsp)
   372  	mov	@x[1],4*1(%rsp)
   373  	xor	%rbx,%rbx
   374  	mov	@x[2],4*2(%rsp)
   375  	mov	@x[3],4*3(%rsp)
   376  	mov	@x[4],4*4(%rsp)
   377  	mov	@x[5],4*5(%rsp)
   378  	mov	@x[6],4*6(%rsp)
   379  	mov	@x[7],4*7(%rsp)
   380  	movdqa	%xmm1,4*8(%rsp)
   381  	mov	@x[12],4*12(%rsp)
   382  	mov	@x[13],4*13(%rsp)
   383  	mov	@x[14],4*14(%rsp)
   384  	mov	@x[15],4*15(%rsp)
   385  
   386  .Loop_tail:
   387  	movzb	($inp,%rbx),%eax
   388  	movzb	(%rsp,%rbx),%edx
   389  	lea	1(%rbx),%rbx
   390  	xor	%edx,%eax
   391  	mov	%al,-1($out,%rbx)
   392  	dec	%rbp
   393  	jnz	.Loop_tail
   394  
   395  .Ldone:
   396  	lea	64+24+48(%rsp),%rsi
   397  	mov	-48(%rsi),%r15
   398  .cfi_restore	r15
   399  	mov	-40(%rsi),%r14
   400  .cfi_restore	r14
   401  	mov	-32(%rsi),%r13
   402  .cfi_restore	r13
   403  	mov	-24(%rsi),%r12
   404  .cfi_restore	r12
   405  	mov	-16(%rsi),%rbp
   406  .cfi_restore	rbp
   407  	mov	-8(%rsi),%rbx
   408  .cfi_restore	rbx
   409  	lea	(%rsi),%rsp
   410  .cfi_adjust_cfa_offset	`-64-24-48`
   411  .Lno_data:
   412  	ret
   413  .cfi_endproc
   414  .size	ChaCha20_ctr32,.-ChaCha20_ctr32
   415  ___
   416  
   417  ########################################################################
   418  # SSSE3 code path that handles shorter lengths
   419  {
   420  my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7));
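        # SSE/SSSE3 has no dword-rotate instruction, so the 16- and 24-bit
        # rotates below are done with pshufb and the byte-shuffle masks
        # .Lrot16/.Lrot24, while the 12- and 7-bit rotates fall back to a
        # pslld/psrld/por sequence.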
   421  
   422  sub SSSE3ROUND {	# critical path is 20 "SIMD ticks" per round
   423  	&paddd	($a,$b);
   424  	&pxor	($d,$a);
   425  	&pshufb	($d,$rot16);
   426  
   427  	&paddd	($c,$d);
   428  	&pxor	($b,$c);
   429  	&movdqa	($t,$b);
   430  	&psrld	($b,20);
   431  	&pslld	($t,12);
   432  	&por	($b,$t);
   433  
   434  	&paddd	($a,$b);
   435  	&pxor	($d,$a);
   436  	&pshufb	($d,$rot24);
   437  
   438  	&paddd	($c,$d);
   439  	&pxor	($b,$c);
   440  	&movdqa	($t,$b);
   441  	&psrld	($b,25);
   442  	&pslld	($t,7);
   443  	&por	($b,$t);
   444  }
   445  
   446  my $xframe = $win64 ? 32+8 : 8;
   447  
   448  $code.=<<___;
   449  .type	ChaCha20_ssse3,\@function,5
   450  .align	32
   451  ChaCha20_ssse3:
   452  .LChaCha20_ssse3:
   453  .cfi_startproc
   454  	mov	%rsp,%r9		# frame pointer
   455  .cfi_def_cfa_register	r9
   456  ___
   457  $code.=<<___;
   458  	cmp	\$128,$len		# we might throw away some data,
   459  	ja	.LChaCha20_4x		# but overall it won't be slower
   460  
   461  .Ldo_sse3_after_all:
   462  	sub	\$64+$xframe,%rsp
   463  ___
   464  $code.=<<___	if ($win64);
   465  	movaps	%xmm6,-0x28(%r9)
   466  	movaps	%xmm7,-0x18(%r9)
   467  .Lssse3_body:
   468  ___
   469  $code.=<<___;
   470  	movdqa	.Lsigma(%rip),$a
   471  	movdqu	($key),$b
   472  	movdqu	16($key),$c
   473  	movdqu	($counter),$d
   474  	movdqa	.Lrot16(%rip),$rot16
   475  	movdqa	.Lrot24(%rip),$rot24
   476  
   477  	movdqa	$a,0x00(%rsp)
   478  	movdqa	$b,0x10(%rsp)
   479  	movdqa	$c,0x20(%rsp)
   480  	movdqa	$d,0x30(%rsp)
   481  	mov	\$10,$counter		# reuse $counter
   482  	jmp	.Loop_ssse3
   483  
   484  .align	32
   485  .Loop_outer_ssse3:
   486  	movdqa	.Lone(%rip),$d
   487  	movdqa	0x00(%rsp),$a
   488  	movdqa	0x10(%rsp),$b
   489  	movdqa	0x20(%rsp),$c
   490  	paddd	0x30(%rsp),$d
   491  	mov	\$10,$counter
   492  	movdqa	$d,0x30(%rsp)
   493  	jmp	.Loop_ssse3
   494  
   495  .align	32
   496  .Loop_ssse3:
   497  ___
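        	# The column round and the diagonal round share the SSSE3ROUND body:
        	# after the first call the $b, $c and $d rows are rotated by one, two
        	# and three dwords (pshufd with 0b00111001, 0b01001110, 0b10010011),
        	# which lines the diagonals up as columns; the shuffles after the
        	# second call rotate them back into place.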
   498  	&SSSE3ROUND();
   499  	&pshufd	($c,$c,0b01001110);
   500  	&pshufd	($b,$b,0b00111001);
   501  	&pshufd	($d,$d,0b10010011);
   502  	&nop	();
   503  
   504  	&SSSE3ROUND();
   505  	&pshufd	($c,$c,0b01001110);
   506  	&pshufd	($b,$b,0b10010011);
   507  	&pshufd	($d,$d,0b00111001);
   508  
   509  	&dec	($counter);
   510  	&jnz	(".Loop_ssse3");
   511  
   512  $code.=<<___;
   513  	paddd	0x00(%rsp),$a
   514  	paddd	0x10(%rsp),$b
   515  	paddd	0x20(%rsp),$c
   516  	paddd	0x30(%rsp),$d
   517  
   518  	cmp	\$64,$len
   519  	jb	.Ltail_ssse3
   520  
   521  	movdqu	0x00($inp),$t
   522  	movdqu	0x10($inp),$t1
   523  	pxor	$t,$a			# xor with input
   524  	movdqu	0x20($inp),$t
   525  	pxor	$t1,$b
   526  	movdqu	0x30($inp),$t1
   527  	lea	0x40($inp),$inp		# inp+=64
   528  	pxor	$t,$c
   529  	pxor	$t1,$d
   530  
   531  	movdqu	$a,0x00($out)		# write output
   532  	movdqu	$b,0x10($out)
   533  	movdqu	$c,0x20($out)
   534  	movdqu	$d,0x30($out)
   535  	lea	0x40($out),$out		# out+=64
   536  
   537  	sub	\$64,$len
   538  	jnz	.Loop_outer_ssse3
   539  
   540  	jmp	.Ldone_ssse3
   541  
   542  .align	16
   543  .Ltail_ssse3:
   544  	movdqa	$a,0x00(%rsp)
   545  	movdqa	$b,0x10(%rsp)
   546  	movdqa	$c,0x20(%rsp)
   547  	movdqa	$d,0x30(%rsp)
   548  	xor	$counter,$counter
   549  
   550  .Loop_tail_ssse3:
   551  	movzb	($inp,$counter),%eax
   552  	movzb	(%rsp,$counter),%ecx
   553  	lea	1($counter),$counter
   554  	xor	%ecx,%eax
   555  	mov	%al,-1($out,$counter)
   556  	dec	$len
   557  	jnz	.Loop_tail_ssse3
   558  
   559  .Ldone_ssse3:
   560  ___
   561  $code.=<<___	if ($win64);
   562  	movaps	-0x28(%r9),%xmm6
   563  	movaps	-0x18(%r9),%xmm7
   564  ___
   565  $code.=<<___;
   566  	lea	(%r9),%rsp
   567  .cfi_def_cfa_register	rsp
   568  .Lssse3_epilogue:
   569  	ret
   570  .cfi_endproc
   571  .size	ChaCha20_ssse3,.-ChaCha20_ssse3
   572  ___
   573  }
   574  
   575  ########################################################################
   576  # SSSE3 code path that handles longer messages.
   577  {
   578  # assign variables to favor Atom front-end
   579  my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3,
   580      $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15));
   581  my  @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
   582  	"%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
   583  
   584  sub SSSE3_lane_ROUND {
   585  my ($a0,$b0,$c0,$d0)=@_;
   586  my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
   587  my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
   588  my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
   589  my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
   590  my @x=map("\"$_\"",@xx);
   591  
   592  	# Consider order in which variables are addressed by their
   593  	# index:
   594  	#
   595  	#	a   b   c   d
   596  	#
   597  	#	0   4   8  12 < even round
   598  	#	1   5   9  13
   599  	#	2   6  10  14
   600  	#	3   7  11  15
   601  	#	0   5  10  15 < odd round
   602  	#	1   6  11  12
   603  	#	2   7   8  13
   604  	#	3   4   9  14
   605  	#
   606  	# 'a', 'b' and 'd's are permanently allocated in registers,
   607  	# @x[0..7,12..15], while 'c's are maintained in memory. If
   608  	# you observe the 'c' column, you'll notice that a pair of 'c's
   609  	# is invariant between rounds. This means that we have to reload
   610  	# them once per round, in the middle. This is why you'll see a
   611  	# bunch of 'c' stores and loads in the middle, but none at
   612  	# the beginning or end.
   613  
   614  	(
   615  	"&paddd		(@x[$a0],@x[$b0])",	# Q1
   616  	 "&paddd	(@x[$a1],@x[$b1])",	# Q2
   617  	"&pxor		(@x[$d0],@x[$a0])",
   618  	 "&pxor		(@x[$d1],@x[$a1])",
   619  	"&pshufb	(@x[$d0],$t1)",
   620  	 "&pshufb	(@x[$d1],$t1)",
   621  
   622  	"&paddd		($xc,@x[$d0])",
   623  	 "&paddd	($xc_,@x[$d1])",
   624  	"&pxor		(@x[$b0],$xc)",
   625  	 "&pxor		(@x[$b1],$xc_)",
   626  	"&movdqa	($t0,@x[$b0])",
   627  	"&pslld		(@x[$b0],12)",
   628  	"&psrld		($t0,20)",
   629  	 "&movdqa	($t1,@x[$b1])",
   630  	 "&pslld	(@x[$b1],12)",
   631  	"&por		(@x[$b0],$t0)",
   632  	 "&psrld	($t1,20)",
   633  	"&movdqa	($t0,'(%r11)')",	# .Lrot24(%rip)
   634  	 "&por		(@x[$b1],$t1)",
   635  
   636  	"&paddd		(@x[$a0],@x[$b0])",
   637  	 "&paddd	(@x[$a1],@x[$b1])",
   638  	"&pxor		(@x[$d0],@x[$a0])",
   639  	 "&pxor		(@x[$d1],@x[$a1])",
   640  	"&pshufb	(@x[$d0],$t0)",
   641  	 "&pshufb	(@x[$d1],$t0)",
   642  
   643  	"&paddd		($xc,@x[$d0])",
   644  	 "&paddd	($xc_,@x[$d1])",
   645  	"&pxor		(@x[$b0],$xc)",
   646  	 "&pxor		(@x[$b1],$xc_)",
   647  	"&movdqa	($t1,@x[$b0])",
   648  	"&pslld		(@x[$b0],7)",
   649  	"&psrld		($t1,25)",
   650  	 "&movdqa	($t0,@x[$b1])",
   651  	 "&pslld	(@x[$b1],7)",
   652  	"&por		(@x[$b0],$t1)",
   653  	 "&psrld	($t0,25)",
   654  	"&movdqa	($t1,'(%r10)')",	# .Lrot16(%rip)
   655  	 "&por		(@x[$b1],$t0)",
   656  
   657  	"&movdqa	(\"`16*($c0-8)`(%rsp)\",$xc)",	# reload pair of 'c's
   658  	 "&movdqa	(\"`16*($c1-8)`(%rsp)\",$xc_)",
   659  	"&movdqa	($xc,\"`16*($c2-8)`(%rsp)\")",
   660  	 "&movdqa	($xc_,\"`16*($c3-8)`(%rsp)\")",
   661  
   662  	"&paddd		(@x[$a2],@x[$b2])",	# Q3
   663  	 "&paddd	(@x[$a3],@x[$b3])",	# Q4
   664  	"&pxor		(@x[$d2],@x[$a2])",
   665  	 "&pxor		(@x[$d3],@x[$a3])",
   666  	"&pshufb	(@x[$d2],$t1)",
   667  	 "&pshufb	(@x[$d3],$t1)",
   668  
   669  	"&paddd		($xc,@x[$d2])",
   670  	 "&paddd	($xc_,@x[$d3])",
   671  	"&pxor		(@x[$b2],$xc)",
   672  	 "&pxor		(@x[$b3],$xc_)",
   673  	"&movdqa	($t0,@x[$b2])",
   674  	"&pslld		(@x[$b2],12)",
   675  	"&psrld		($t0,20)",
   676  	 "&movdqa	($t1,@x[$b3])",
   677  	 "&pslld	(@x[$b3],12)",
   678  	"&por		(@x[$b2],$t0)",
   679  	 "&psrld	($t1,20)",
   680  	"&movdqa	($t0,'(%r11)')",	# .Lrot24(%rip)
   681  	 "&por		(@x[$b3],$t1)",
   682  
   683  	"&paddd		(@x[$a2],@x[$b2])",
   684  	 "&paddd	(@x[$a3],@x[$b3])",
   685  	"&pxor		(@x[$d2],@x[$a2])",
   686  	 "&pxor		(@x[$d3],@x[$a3])",
   687  	"&pshufb	(@x[$d2],$t0)",
   688  	 "&pshufb	(@x[$d3],$t0)",
   689  
   690  	"&paddd		($xc,@x[$d2])",
   691  	 "&paddd	($xc_,@x[$d3])",
   692  	"&pxor		(@x[$b2],$xc)",
   693  	 "&pxor		(@x[$b3],$xc_)",
   694  	"&movdqa	($t1,@x[$b2])",
   695  	"&pslld		(@x[$b2],7)",
   696  	"&psrld		($t1,25)",
   697  	 "&movdqa	($t0,@x[$b3])",
   698  	 "&pslld	(@x[$b3],7)",
   699  	"&por		(@x[$b2],$t1)",
   700  	 "&psrld	($t0,25)",
   701  	"&movdqa	($t1,'(%r10)')",	# .Lrot16(%rip)
   702  	 "&por		(@x[$b3],$t0)"
   703  	);
   704  }
   705  
   706  my $xframe = $win64 ? 0xa8 : 8;
   707  
   708  $code.=<<___;
   709  .type	ChaCha20_4x,\@function,5
   710  .align	32
   711  ChaCha20_4x:
   712  .LChaCha20_4x:
   713  .cfi_startproc
   714  	mov		%rsp,%r9		# frame pointer
   715  .cfi_def_cfa_register	r9
   716  	mov		%r10,%r11
   717  ___
   718  $code.=<<___	if ($avx>1);
   719  	shr		\$32,%r10		# OPENSSL_ia32cap_P+8
   720  	test		\$`1<<5`,%r10		# test AVX2
   721  	jnz		.LChaCha20_8x
   722  ___
   723  $code.=<<___;
   724  	cmp		\$192,$len
   725  	ja		.Lproceed4x
   726  
   727  	and		\$`1<<26|1<<22`,%r11	# isolate XSAVE+MOVBE
   728  	cmp		\$`1<<22`,%r11		# check for MOVBE without XSAVE
   729  	je		.Ldo_sse3_after_all	# to detect Atom
   730  
   731  .Lproceed4x:
   732  	sub		\$0x140+$xframe,%rsp
   733  ___
   734  	################ stack layout
   735  	# +0x00		SIMD equivalent of @x[8-12]
   736  	# ...
   737  	# +0x40		constant copy of key[0-2] smashed by lanes
   738  	# ...
   739  	# +0x100	SIMD counters (with nonce smashed by lanes)
   740  	# ...
   741  	# +0x140
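        	#
        	# Each xmm register in this path holds one of the 16 state words
        	# for four independent blocks ("smashed by lanes"): every state
        	# word is broadcast with pshufd 0x00/0x55/0xaa/0xff, and .Linc
        	# adds 0..3 to the four copies of the block counter, so one
        	# iteration produces four consecutive 64-byte blocks.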
   742  $code.=<<___	if ($win64);
   743  	movaps		%xmm6,-0xa8(%r9)
   744  	movaps		%xmm7,-0x98(%r9)
   745  	movaps		%xmm8,-0x88(%r9)
   746  	movaps		%xmm9,-0x78(%r9)
   747  	movaps		%xmm10,-0x68(%r9)
   748  	movaps		%xmm11,-0x58(%r9)
   749  	movaps		%xmm12,-0x48(%r9)
   750  	movaps		%xmm13,-0x38(%r9)
   751  	movaps		%xmm14,-0x28(%r9)
   752  	movaps		%xmm15,-0x18(%r9)
   753  .L4x_body:
   754  ___
   755  $code.=<<___;
   756  	movdqa		.Lsigma(%rip),$xa3	# key[0]
   757  	movdqu		($key),$xb3		# key[1]
   758  	movdqu		16($key),$xt3		# key[2]
   759  	movdqu		($counter),$xd3		# key[3]
   760  	lea		0x100(%rsp),%rcx	# size optimization
   761  	lea		.Lrot16(%rip),%r10
   762  	lea		.Lrot24(%rip),%r11
   763  
   764  	pshufd		\$0x00,$xa3,$xa0	# smash key by lanes...
   765  	pshufd		\$0x55,$xa3,$xa1
   766  	movdqa		$xa0,0x40(%rsp)		# ... and offload
   767  	pshufd		\$0xaa,$xa3,$xa2
   768  	movdqa		$xa1,0x50(%rsp)
   769  	pshufd		\$0xff,$xa3,$xa3
   770  	movdqa		$xa2,0x60(%rsp)
   771  	movdqa		$xa3,0x70(%rsp)
   772  
   773  	pshufd		\$0x00,$xb3,$xb0
   774  	pshufd		\$0x55,$xb3,$xb1
   775  	movdqa		$xb0,0x80-0x100(%rcx)
   776  	pshufd		\$0xaa,$xb3,$xb2
   777  	movdqa		$xb1,0x90-0x100(%rcx)
   778  	pshufd		\$0xff,$xb3,$xb3
   779  	movdqa		$xb2,0xa0-0x100(%rcx)
   780  	movdqa		$xb3,0xb0-0x100(%rcx)
   781  
   782  	pshufd		\$0x00,$xt3,$xt0	# "$xc0"
   783  	pshufd		\$0x55,$xt3,$xt1	# "$xc1"
   784  	movdqa		$xt0,0xc0-0x100(%rcx)
   785  	pshufd		\$0xaa,$xt3,$xt2	# "$xc2"
   786  	movdqa		$xt1,0xd0-0x100(%rcx)
   787  	pshufd		\$0xff,$xt3,$xt3	# "$xc3"
   788  	movdqa		$xt2,0xe0-0x100(%rcx)
   789  	movdqa		$xt3,0xf0-0x100(%rcx)
   790  
   791  	pshufd		\$0x00,$xd3,$xd0
   792  	pshufd		\$0x55,$xd3,$xd1
   793  	paddd		.Linc(%rip),$xd0	# don't save counters yet
   794  	pshufd		\$0xaa,$xd3,$xd2
   795  	movdqa		$xd1,0x110-0x100(%rcx)
   796  	pshufd		\$0xff,$xd3,$xd3
   797  	movdqa		$xd2,0x120-0x100(%rcx)
   798  	movdqa		$xd3,0x130-0x100(%rcx)
   799  
   800  	jmp		.Loop_enter4x
   801  
   802  .align	32
   803  .Loop_outer4x:
   804  	movdqa		0x40(%rsp),$xa0		# re-load smashed key
   805  	movdqa		0x50(%rsp),$xa1
   806  	movdqa		0x60(%rsp),$xa2
   807  	movdqa		0x70(%rsp),$xa3
   808  	movdqa		0x80-0x100(%rcx),$xb0
   809  	movdqa		0x90-0x100(%rcx),$xb1
   810  	movdqa		0xa0-0x100(%rcx),$xb2
   811  	movdqa		0xb0-0x100(%rcx),$xb3
   812  	movdqa		0xc0-0x100(%rcx),$xt0	# "$xc0"
   813  	movdqa		0xd0-0x100(%rcx),$xt1	# "$xc1"
   814  	movdqa		0xe0-0x100(%rcx),$xt2	# "$xc2"
   815  	movdqa		0xf0-0x100(%rcx),$xt3	# "$xc3"
   816  	movdqa		0x100-0x100(%rcx),$xd0
   817  	movdqa		0x110-0x100(%rcx),$xd1
   818  	movdqa		0x120-0x100(%rcx),$xd2
   819  	movdqa		0x130-0x100(%rcx),$xd3
   820  	paddd		.Lfour(%rip),$xd0	# next SIMD counters
   821  
   822  .Loop_enter4x:
   823  	movdqa		$xt2,0x20(%rsp)		# SIMD equivalent of "@x[10]"
   824  	movdqa		$xt3,0x30(%rsp)		# SIMD equivalent of "@x[11]"
   825  	movdqa		(%r10),$xt3		# .Lrot16(%rip)
   826  	mov		\$10,%eax
   827  	movdqa		$xd0,0x100-0x100(%rcx)	# save SIMD counters
   828  	jmp		.Loop4x
   829  
   830  .align	32
   831  .Loop4x:
   832  ___
   833  	foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; }
   834  	foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; }
   835  $code.=<<___;
   836  	dec		%eax
   837  	jnz		.Loop4x
   838  
   839  	paddd		0x40(%rsp),$xa0		# accumulate key material
   840  	paddd		0x50(%rsp),$xa1
   841  	paddd		0x60(%rsp),$xa2
   842  	paddd		0x70(%rsp),$xa3
   843  
   844  	movdqa		$xa0,$xt2		# "de-interlace" data
   845  	punpckldq	$xa1,$xa0
   846  	movdqa		$xa2,$xt3
   847  	punpckldq	$xa3,$xa2
   848  	punpckhdq	$xa1,$xt2
   849  	punpckhdq	$xa3,$xt3
   850  	movdqa		$xa0,$xa1
   851  	punpcklqdq	$xa2,$xa0		# "a0"
   852  	movdqa		$xt2,$xa3
   853  	punpcklqdq	$xt3,$xt2		# "a2"
   854  	punpckhqdq	$xa2,$xa1		# "a1"
   855  	punpckhqdq	$xt3,$xa3		# "a3"
   856  ___
   857  	($xa2,$xt2)=($xt2,$xa2);
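        	# Each of the four registers entering the punpck sequence holds one
        	# state word for blocks 0..3; the punpckl/hdq and punpckl/hqdq
        	# levels transpose that 4x4 dword matrix so that "a0".."a3" each
        	# hold 16 contiguous keystream bytes of one block.  punpcklqdq
        	# leaves "a2" in $xt2, hence the register swap above; the same
        	# pattern repeats for the b, c and d groups below.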
   858  $code.=<<___;
   859  	paddd		0x80-0x100(%rcx),$xb0
   860  	paddd		0x90-0x100(%rcx),$xb1
   861  	paddd		0xa0-0x100(%rcx),$xb2
   862  	paddd		0xb0-0x100(%rcx),$xb3
   863  
   864  	movdqa		$xa0,0x00(%rsp)		# offload $xaN
   865  	movdqa		$xa1,0x10(%rsp)
   866  	movdqa		0x20(%rsp),$xa0		# "xc2"
   867  	movdqa		0x30(%rsp),$xa1		# "xc3"
   868  
   869  	movdqa		$xb0,$xt2
   870  	punpckldq	$xb1,$xb0
   871  	movdqa		$xb2,$xt3
   872  	punpckldq	$xb3,$xb2
   873  	punpckhdq	$xb1,$xt2
   874  	punpckhdq	$xb3,$xt3
   875  	movdqa		$xb0,$xb1
   876  	punpcklqdq	$xb2,$xb0		# "b0"
   877  	movdqa		$xt2,$xb3
   878  	punpcklqdq	$xt3,$xt2		# "b2"
   879  	punpckhqdq	$xb2,$xb1		# "b1"
   880  	punpckhqdq	$xt3,$xb3		# "b3"
   881  ___
   882  	($xb2,$xt2)=($xt2,$xb2);
   883  	my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
   884  $code.=<<___;
   885  	paddd		0xc0-0x100(%rcx),$xc0
   886  	paddd		0xd0-0x100(%rcx),$xc1
   887  	paddd		0xe0-0x100(%rcx),$xc2
   888  	paddd		0xf0-0x100(%rcx),$xc3
   889  
   890  	movdqa		$xa2,0x20(%rsp)		# keep offloading $xaN
   891  	movdqa		$xa3,0x30(%rsp)
   892  
   893  	movdqa		$xc0,$xt2
   894  	punpckldq	$xc1,$xc0
   895  	movdqa		$xc2,$xt3
   896  	punpckldq	$xc3,$xc2
   897  	punpckhdq	$xc1,$xt2
   898  	punpckhdq	$xc3,$xt3
   899  	movdqa		$xc0,$xc1
   900  	punpcklqdq	$xc2,$xc0		# "c0"
   901  	movdqa		$xt2,$xc3
   902  	punpcklqdq	$xt3,$xt2		# "c2"
   903  	punpckhqdq	$xc2,$xc1		# "c1"
   904  	punpckhqdq	$xt3,$xc3		# "c3"
   905  ___
   906  	($xc2,$xt2)=($xt2,$xc2);
   907  	($xt0,$xt1)=($xa2,$xa3);		# use $xaN as temporary
   908  $code.=<<___;
   909  	paddd		0x100-0x100(%rcx),$xd0
   910  	paddd		0x110-0x100(%rcx),$xd1
   911  	paddd		0x120-0x100(%rcx),$xd2
   912  	paddd		0x130-0x100(%rcx),$xd3
   913  
   914  	movdqa		$xd0,$xt2
   915  	punpckldq	$xd1,$xd0
   916  	movdqa		$xd2,$xt3
   917  	punpckldq	$xd3,$xd2
   918  	punpckhdq	$xd1,$xt2
   919  	punpckhdq	$xd3,$xt3
   920  	movdqa		$xd0,$xd1
   921  	punpcklqdq	$xd2,$xd0		# "d0"
   922  	movdqa		$xt2,$xd3
   923  	punpcklqdq	$xt3,$xt2		# "d2"
   924  	punpckhqdq	$xd2,$xd1		# "d1"
   925  	punpckhqdq	$xt3,$xd3		# "d3"
   926  ___
   927  	($xd2,$xt2)=($xt2,$xd2);
   928  $code.=<<___;
   929  	cmp		\$64*4,$len
   930  	jb		.Ltail4x
   931  
   932  	movdqu		0x00($inp),$xt0		# xor with input
   933  	movdqu		0x10($inp),$xt1
   934  	movdqu		0x20($inp),$xt2
   935  	movdqu		0x30($inp),$xt3
   936  	pxor		0x00(%rsp),$xt0		# $xaN is offloaded, remember?
   937  	pxor		$xb0,$xt1
   938  	pxor		$xc0,$xt2
   939  	pxor		$xd0,$xt3
   940  
   941  	 movdqu		$xt0,0x00($out)
   942  	movdqu		0x40($inp),$xt0
   943  	 movdqu		$xt1,0x10($out)
   944  	movdqu		0x50($inp),$xt1
   945  	 movdqu		$xt2,0x20($out)
   946  	movdqu		0x60($inp),$xt2
   947  	 movdqu		$xt3,0x30($out)
   948  	movdqu		0x70($inp),$xt3
   949  	lea		0x80($inp),$inp		# size optimization
   950  	pxor		0x10(%rsp),$xt0
   951  	pxor		$xb1,$xt1
   952  	pxor		$xc1,$xt2
   953  	pxor		$xd1,$xt3
   954  
   955  	 movdqu		$xt0,0x40($out)
   956  	movdqu		0x00($inp),$xt0
   957  	 movdqu		$xt1,0x50($out)
   958  	movdqu		0x10($inp),$xt1
   959  	 movdqu		$xt2,0x60($out)
   960  	movdqu		0x20($inp),$xt2
   961  	 movdqu		$xt3,0x70($out)
   962  	 lea		0x80($out),$out		# size optimization
   963  	movdqu		0x30($inp),$xt3
   964  	pxor		0x20(%rsp),$xt0
   965  	pxor		$xb2,$xt1
   966  	pxor		$xc2,$xt2
   967  	pxor		$xd2,$xt3
   968  
   969  	 movdqu		$xt0,0x00($out)
   970  	movdqu		0x40($inp),$xt0
   971  	 movdqu		$xt1,0x10($out)
   972  	movdqu		0x50($inp),$xt1
   973  	 movdqu		$xt2,0x20($out)
   974  	movdqu		0x60($inp),$xt2
   975  	 movdqu		$xt3,0x30($out)
   976  	movdqu		0x70($inp),$xt3
   977  	lea		0x80($inp),$inp		# inp+=64*4
   978  	pxor		0x30(%rsp),$xt0
   979  	pxor		$xb3,$xt1
   980  	pxor		$xc3,$xt2
   981  	pxor		$xd3,$xt3
   982  	movdqu		$xt0,0x40($out)
   983  	movdqu		$xt1,0x50($out)
   984  	movdqu		$xt2,0x60($out)
   985  	movdqu		$xt3,0x70($out)
   986  	lea		0x80($out),$out		# out+=64*4
   987  
   988  	sub		\$64*4,$len
   989  	jnz		.Loop_outer4x
   990  
   991  	jmp		.Ldone4x
   992  
   993  .Ltail4x:
   994  	cmp		\$192,$len
   995  	jae		.L192_or_more4x
   996  	cmp		\$128,$len
   997  	jae		.L128_or_more4x
   998  	cmp		\$64,$len
   999  	jae		.L64_or_more4x
  1000  
  1001  	#movdqa		0x00(%rsp),$xt0		# $xaN is offloaded, remember?
  1002  	xor		%r10,%r10
  1003  	#movdqa		$xt0,0x00(%rsp)
  1004  	movdqa		$xb0,0x10(%rsp)
  1005  	movdqa		$xc0,0x20(%rsp)
  1006  	movdqa		$xd0,0x30(%rsp)
  1007  	jmp		.Loop_tail4x
  1008  
  1009  .align	32
  1010  .L64_or_more4x:
  1011  	movdqu		0x00($inp),$xt0		# xor with input
  1012  	movdqu		0x10($inp),$xt1
  1013  	movdqu		0x20($inp),$xt2
  1014  	movdqu		0x30($inp),$xt3
  1015  	pxor		0x00(%rsp),$xt0		# $xaN is offloaded, remember?
  1016  	pxor		$xb0,$xt1
  1017  	pxor		$xc0,$xt2
  1018  	pxor		$xd0,$xt3
  1019  	movdqu		$xt0,0x00($out)
  1020  	movdqu		$xt1,0x10($out)
  1021  	movdqu		$xt2,0x20($out)
  1022  	movdqu		$xt3,0x30($out)
  1023  	je		.Ldone4x
  1024  
  1025  	movdqa		0x10(%rsp),$xt0		# $xaN is offloaded, remember?
  1026  	lea		0x40($inp),$inp		# inp+=64*1
  1027  	xor		%r10,%r10
  1028  	movdqa		$xt0,0x00(%rsp)
  1029  	movdqa		$xb1,0x10(%rsp)
  1030  	lea		0x40($out),$out		# out+=64*1
  1031  	movdqa		$xc1,0x20(%rsp)
  1032  	sub		\$64,$len		# len-=64*1
  1033  	movdqa		$xd1,0x30(%rsp)
  1034  	jmp		.Loop_tail4x
  1035  
  1036  .align	32
  1037  .L128_or_more4x:
  1038  	movdqu		0x00($inp),$xt0		# xor with input
  1039  	movdqu		0x10($inp),$xt1
  1040  	movdqu		0x20($inp),$xt2
  1041  	movdqu		0x30($inp),$xt3
  1042  	pxor		0x00(%rsp),$xt0		# $xaN is offloaded, remember?
  1043  	pxor		$xb0,$xt1
  1044  	pxor		$xc0,$xt2
  1045  	pxor		$xd0,$xt3
  1046  
  1047  	 movdqu		$xt0,0x00($out)
  1048  	movdqu		0x40($inp),$xt0
  1049  	 movdqu		$xt1,0x10($out)
  1050  	movdqu		0x50($inp),$xt1
  1051  	 movdqu		$xt2,0x20($out)
  1052  	movdqu		0x60($inp),$xt2
  1053  	 movdqu		$xt3,0x30($out)
  1054  	movdqu		0x70($inp),$xt3
  1055  	pxor		0x10(%rsp),$xt0
  1056  	pxor		$xb1,$xt1
  1057  	pxor		$xc1,$xt2
  1058  	pxor		$xd1,$xt3
  1059  	movdqu		$xt0,0x40($out)
  1060  	movdqu		$xt1,0x50($out)
  1061  	movdqu		$xt2,0x60($out)
  1062  	movdqu		$xt3,0x70($out)
  1063  	je		.Ldone4x
  1064  
  1065  	movdqa		0x20(%rsp),$xt0		# $xaN is offloaded, remember?
  1066  	lea		0x80($inp),$inp		# inp+=64*2
  1067  	xor		%r10,%r10
  1068  	movdqa		$xt0,0x00(%rsp)
  1069  	movdqa		$xb2,0x10(%rsp)
  1070  	lea		0x80($out),$out		# out+=64*2
  1071  	movdqa		$xc2,0x20(%rsp)
  1072  	sub		\$128,$len		# len-=64*2
  1073  	movdqa		$xd2,0x30(%rsp)
  1074  	jmp		.Loop_tail4x
  1075  
  1076  .align	32
  1077  .L192_or_more4x:
  1078  	movdqu		0x00($inp),$xt0		# xor with input
  1079  	movdqu		0x10($inp),$xt1
  1080  	movdqu		0x20($inp),$xt2
  1081  	movdqu		0x30($inp),$xt3
  1082  	pxor		0x00(%rsp),$xt0		# $xaN is offloaded, remember?
  1083  	pxor		$xb0,$xt1
  1084  	pxor		$xc0,$xt2
  1085  	pxor		$xd0,$xt3
  1086  
  1087  	 movdqu		$xt0,0x00($out)
  1088  	movdqu		0x40($inp),$xt0
  1089  	 movdqu		$xt1,0x10($out)
  1090  	movdqu		0x50($inp),$xt1
  1091  	 movdqu		$xt2,0x20($out)
  1092  	movdqu		0x60($inp),$xt2
  1093  	 movdqu		$xt3,0x30($out)
  1094  	movdqu		0x70($inp),$xt3
  1095  	lea		0x80($inp),$inp		# size optimization
  1096  	pxor		0x10(%rsp),$xt0
  1097  	pxor		$xb1,$xt1
  1098  	pxor		$xc1,$xt2
  1099  	pxor		$xd1,$xt3
  1100  
  1101  	 movdqu		$xt0,0x40($out)
  1102  	movdqu		0x00($inp),$xt0
  1103  	 movdqu		$xt1,0x50($out)
  1104  	movdqu		0x10($inp),$xt1
  1105  	 movdqu		$xt2,0x60($out)
  1106  	movdqu		0x20($inp),$xt2
  1107  	 movdqu		$xt3,0x70($out)
  1108  	 lea		0x80($out),$out		# size optimization
  1109  	movdqu		0x30($inp),$xt3
  1110  	pxor		0x20(%rsp),$xt0
  1111  	pxor		$xb2,$xt1
  1112  	pxor		$xc2,$xt2
  1113  	pxor		$xd2,$xt3
  1114  	movdqu		$xt0,0x00($out)
  1115  	movdqu		$xt1,0x10($out)
  1116  	movdqu		$xt2,0x20($out)
  1117  	movdqu		$xt3,0x30($out)
  1118  	je		.Ldone4x
  1119  
  1120  	movdqa		0x30(%rsp),$xt0		# $xaN is offloaded, remember?
  1121  	lea		0x40($inp),$inp		# inp+=64*3
  1122  	xor		%r10,%r10
  1123  	movdqa		$xt0,0x00(%rsp)
  1124  	movdqa		$xb3,0x10(%rsp)
  1125  	lea		0x40($out),$out		# out+=64*3
  1126  	movdqa		$xc3,0x20(%rsp)
  1127  	sub		\$192,$len		# len-=64*3
  1128  	movdqa		$xd3,0x30(%rsp)
  1129  
  1130  .Loop_tail4x:
  1131  	movzb		($inp,%r10),%eax
  1132  	movzb		(%rsp,%r10),%ecx
  1133  	lea		1(%r10),%r10
  1134  	xor		%ecx,%eax
  1135  	mov		%al,-1($out,%r10)
  1136  	dec		$len
  1137  	jnz		.Loop_tail4x
  1138  
  1139  .Ldone4x:
  1140  ___
  1141  $code.=<<___	if ($win64);
  1142  	movaps		-0xa8(%r9),%xmm6
  1143  	movaps		-0x98(%r9),%xmm7
  1144  	movaps		-0x88(%r9),%xmm8
  1145  	movaps		-0x78(%r9),%xmm9
  1146  	movaps		-0x68(%r9),%xmm10
  1147  	movaps		-0x58(%r9),%xmm11
  1148  	movaps		-0x48(%r9),%xmm12
  1149  	movaps		-0x38(%r9),%xmm13
  1150  	movaps		-0x28(%r9),%xmm14
  1151  	movaps		-0x18(%r9),%xmm15
  1152  ___
  1153  $code.=<<___;
  1154  	lea		(%r9),%rsp
  1155  .cfi_def_cfa_register	rsp
  1156  .L4x_epilogue:
  1157  	ret
  1158  .cfi_endproc
  1159  .size	ChaCha20_4x,.-ChaCha20_4x
  1160  ___
  1161  }
  1162  
  1163  ########################################################################
  1164  # AVX2 code path
  1165  if ($avx>1) {
  1166  my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
  1167      $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15));
  1168  my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
  1169  	"%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
  1170  
  1171  sub AVX2_lane_ROUND {
  1172  my ($a0,$b0,$c0,$d0)=@_;
  1173  my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
  1174  my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
  1175  my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
  1176  my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
  1177  my @x=map("\"$_\"",@xx);
  1178  
  1179  	# Consider order in which variables are addressed by their
  1180  	# index:
  1181  	#
  1182  	#	a   b   c   d
  1183  	#
  1184  	#	0   4   8  12 < even round
  1185  	#	1   5   9  13
  1186  	#	2   6  10  14
  1187  	#	3   7  11  15
  1188  	#	0   5  10  15 < odd round
  1189  	#	1   6  11  12
  1190  	#	2   7   8  13
  1191  	#	3   4   9  14
  1192  	#
  1193  	# 'a', 'b' and 'd's are permanently allocated in registers,
  1194  	# @x[0..7,12..15], while 'c's are maintained in memory. If
  1195  	# you observe the 'c' column, you'll notice that a pair of 'c's
  1196  	# is invariant between rounds. This means that we have to reload
  1197  	# them once per round, in the middle. This is why you'll see a
  1198  	# bunch of 'c' stores and loads in the middle, but none at
  1199  	# the beginning or end.
  1200  
  1201  	(
  1202  	"&vpaddd	(@x[$a0],@x[$a0],@x[$b0])",	# Q1
  1203  	"&vpxor		(@x[$d0],@x[$a0],@x[$d0])",
  1204  	"&vpshufb	(@x[$d0],@x[$d0],$t1)",
  1205  	 "&vpaddd	(@x[$a1],@x[$a1],@x[$b1])",	# Q2
  1206  	 "&vpxor	(@x[$d1],@x[$a1],@x[$d1])",
  1207  	 "&vpshufb	(@x[$d1],@x[$d1],$t1)",
  1208  
  1209  	"&vpaddd	($xc,$xc,@x[$d0])",
  1210  	"&vpxor		(@x[$b0],$xc,@x[$b0])",
  1211  	"&vpslld	($t0,@x[$b0],12)",
  1212  	"&vpsrld	(@x[$b0],@x[$b0],20)",
  1213  	"&vpor		(@x[$b0],$t0,@x[$b0])",
  1214  	"&vbroadcasti128($t0,'(%r11)')",		# .Lrot24(%rip)
  1215  	 "&vpaddd	($xc_,$xc_,@x[$d1])",
  1216  	 "&vpxor	(@x[$b1],$xc_,@x[$b1])",
  1217  	 "&vpslld	($t1,@x[$b1],12)",
  1218  	 "&vpsrld	(@x[$b1],@x[$b1],20)",
  1219  	 "&vpor		(@x[$b1],$t1,@x[$b1])",
  1220  
  1221  	"&vpaddd	(@x[$a0],@x[$a0],@x[$b0])",
  1222  	"&vpxor		(@x[$d0],@x[$a0],@x[$d0])",
  1223  	"&vpshufb	(@x[$d0],@x[$d0],$t0)",
  1224  	 "&vpaddd	(@x[$a1],@x[$a1],@x[$b1])",
  1225  	 "&vpxor	(@x[$d1],@x[$a1],@x[$d1])",
  1226  	 "&vpshufb	(@x[$d1],@x[$d1],$t0)",
  1227  
  1228  	"&vpaddd	($xc,$xc,@x[$d0])",
  1229  	"&vpxor		(@x[$b0],$xc,@x[$b0])",
  1230  	"&vpslld	($t1,@x[$b0],7)",
  1231  	"&vpsrld	(@x[$b0],@x[$b0],25)",
  1232  	"&vpor		(@x[$b0],$t1,@x[$b0])",
  1233  	"&vbroadcasti128($t1,'(%r10)')",		# .Lrot16(%rip)
  1234  	 "&vpaddd	($xc_,$xc_,@x[$d1])",
  1235  	 "&vpxor	(@x[$b1],$xc_,@x[$b1])",
  1236  	 "&vpslld	($t0,@x[$b1],7)",
  1237  	 "&vpsrld	(@x[$b1],@x[$b1],25)",
  1238  	 "&vpor		(@x[$b1],$t0,@x[$b1])",
  1239  
  1240  	"&vmovdqa	(\"`32*($c0-8)`(%rsp)\",$xc)",	# reload pair of 'c's
  1241  	 "&vmovdqa	(\"`32*($c1-8)`(%rsp)\",$xc_)",
  1242  	"&vmovdqa	($xc,\"`32*($c2-8)`(%rsp)\")",
  1243  	 "&vmovdqa	($xc_,\"`32*($c3-8)`(%rsp)\")",
  1244  
  1245  	"&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",	# Q3
  1246  	"&vpxor		(@x[$d2],@x[$a2],@x[$d2])",
  1247  	"&vpshufb	(@x[$d2],@x[$d2],$t1)",
  1248  	 "&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",	# Q4
  1249  	 "&vpxor	(@x[$d3],@x[$a3],@x[$d3])",
  1250  	 "&vpshufb	(@x[$d3],@x[$d3],$t1)",
  1251  
  1252  	"&vpaddd	($xc,$xc,@x[$d2])",
  1253  	"&vpxor		(@x[$b2],$xc,@x[$b2])",
  1254  	"&vpslld	($t0,@x[$b2],12)",
  1255  	"&vpsrld	(@x[$b2],@x[$b2],20)",
  1256  	"&vpor		(@x[$b2],$t0,@x[$b2])",
  1257  	"&vbroadcasti128($t0,'(%r11)')",		# .Lrot24(%rip)
  1258  	 "&vpaddd	($xc_,$xc_,@x[$d3])",
  1259  	 "&vpxor	(@x[$b3],$xc_,@x[$b3])",
  1260  	 "&vpslld	($t1,@x[$b3],12)",
  1261  	 "&vpsrld	(@x[$b3],@x[$b3],20)",
  1262  	 "&vpor		(@x[$b3],$t1,@x[$b3])",
  1263  
  1264  	"&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",
  1265  	"&vpxor		(@x[$d2],@x[$a2],@x[$d2])",
  1266  	"&vpshufb	(@x[$d2],@x[$d2],$t0)",
  1267  	 "&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",
  1268  	 "&vpxor	(@x[$d3],@x[$a3],@x[$d3])",
  1269  	 "&vpshufb	(@x[$d3],@x[$d3],$t0)",
  1270  
  1271  	"&vpaddd	($xc,$xc,@x[$d2])",
  1272  	"&vpxor		(@x[$b2],$xc,@x[$b2])",
  1273  	"&vpslld	($t1,@x[$b2],7)",
  1274  	"&vpsrld	(@x[$b2],@x[$b2],25)",
  1275  	"&vpor		(@x[$b2],$t1,@x[$b2])",
  1276  	"&vbroadcasti128($t1,'(%r10)')",		# .Lrot16(%rip)
  1277  	 "&vpaddd	($xc_,$xc_,@x[$d3])",
  1278  	 "&vpxor	(@x[$b3],$xc_,@x[$b3])",
  1279  	 "&vpslld	($t0,@x[$b3],7)",
  1280  	 "&vpsrld	(@x[$b3],@x[$b3],25)",
  1281  	 "&vpor		(@x[$b3],$t0,@x[$b3])"
  1282  	);
  1283  }
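
        # The AVX2 path is the same lane-wise construction widened to eight
        # blocks: each ymm register carries one state word for blocks 0..7
        # (two 128-bit lanes of four), .Lincy/.Leight advance the eight block
        # counters, and the .Lrot16/.Lrot24 masks are (re)loaded on the fly
        # with vbroadcasti128 from %r10/%r11 because all sixteen ymm registers
        # are already occupied by state and temporaries.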
  1284  
  1285  my $xframe = $win64 ? 0xa8 : 8;
  1286  
  1287  $code.=<<___;
  1288  .type	ChaCha20_8x,\@function,5
  1289  .align	32
  1290  ChaCha20_8x:
  1291  .LChaCha20_8x:
  1292  .cfi_startproc
  1293  	mov		%rsp,%r9		# frame register
  1294  .cfi_def_cfa_register	r9
  1295  	sub		\$0x280+$xframe,%rsp
  1296  	and		\$-32,%rsp
  1297  ___
  1298  $code.=<<___	if ($win64);
  1299  	movaps		%xmm6,-0xa8(%r9)
  1300  	movaps		%xmm7,-0x98(%r9)
  1301  	movaps		%xmm8,-0x88(%r9)
  1302  	movaps		%xmm9,-0x78(%r9)
  1303  	movaps		%xmm10,-0x68(%r9)
  1304  	movaps		%xmm11,-0x58(%r9)
  1305  	movaps		%xmm12,-0x48(%r9)
  1306  	movaps		%xmm13,-0x38(%r9)
  1307  	movaps		%xmm14,-0x28(%r9)
  1308  	movaps		%xmm15,-0x18(%r9)
  1309  .L8x_body:
  1310  ___
  1311  $code.=<<___;
  1312  	vzeroupper
  1313  
  1314  	################ stack layout
  1315  	# +0x00		SIMD equivalent of @x[8-12]
  1316  	# ...
  1317  	# +0x80		constant copy of key[0-2] smashed by lanes
  1318  	# ...
  1319  	# +0x200	SIMD counters (with nonce smashed by lanes)
  1320  	# ...
  1321  	# +0x280
  1322  
  1323  	vbroadcasti128	.Lsigma(%rip),$xa3	# key[0]
  1324  	vbroadcasti128	($key),$xb3		# key[1]
  1325  	vbroadcasti128	16($key),$xt3		# key[2]
  1326  	vbroadcasti128	($counter),$xd3		# key[3]
  1327  	lea		0x100(%rsp),%rcx	# size optimization
  1328  	lea		0x200(%rsp),%rax	# size optimization
  1329  	lea		.Lrot16(%rip),%r10
  1330  	lea		.Lrot24(%rip),%r11
  1331  
  1332  	vpshufd		\$0x00,$xa3,$xa0	# smash key by lanes...
  1333  	vpshufd		\$0x55,$xa3,$xa1
  1334  	vmovdqa		$xa0,0x80-0x100(%rcx)	# ... and offload
  1335  	vpshufd		\$0xaa,$xa3,$xa2
  1336  	vmovdqa		$xa1,0xa0-0x100(%rcx)
  1337  	vpshufd		\$0xff,$xa3,$xa3
  1338  	vmovdqa		$xa2,0xc0-0x100(%rcx)
  1339  	vmovdqa		$xa3,0xe0-0x100(%rcx)
  1340  
  1341  	vpshufd		\$0x00,$xb3,$xb0
  1342  	vpshufd		\$0x55,$xb3,$xb1
  1343  	vmovdqa		$xb0,0x100-0x100(%rcx)
  1344  	vpshufd		\$0xaa,$xb3,$xb2
  1345  	vmovdqa		$xb1,0x120-0x100(%rcx)
  1346  	vpshufd		\$0xff,$xb3,$xb3
  1347  	vmovdqa		$xb2,0x140-0x100(%rcx)
  1348  	vmovdqa		$xb3,0x160-0x100(%rcx)
  1349  
  1350  	vpshufd		\$0x00,$xt3,$xt0	# "xc0"
  1351  	vpshufd		\$0x55,$xt3,$xt1	# "xc1"
  1352  	vmovdqa		$xt0,0x180-0x200(%rax)
  1353  	vpshufd		\$0xaa,$xt3,$xt2	# "xc2"
  1354  	vmovdqa		$xt1,0x1a0-0x200(%rax)
  1355  	vpshufd		\$0xff,$xt3,$xt3	# "xc3"
  1356  	vmovdqa		$xt2,0x1c0-0x200(%rax)
  1357  	vmovdqa		$xt3,0x1e0-0x200(%rax)
  1358  
  1359  	vpshufd		\$0x00,$xd3,$xd0
  1360  	vpshufd		\$0x55,$xd3,$xd1
  1361  	vpaddd		.Lincy(%rip),$xd0,$xd0	# don't save counters yet
  1362  	vpshufd		\$0xaa,$xd3,$xd2
  1363  	vmovdqa		$xd1,0x220-0x200(%rax)
  1364  	vpshufd		\$0xff,$xd3,$xd3
  1365  	vmovdqa		$xd2,0x240-0x200(%rax)
  1366  	vmovdqa		$xd3,0x260-0x200(%rax)
  1367  
  1368  	jmp		.Loop_enter8x
  1369  
  1370  .align	32
  1371  .Loop_outer8x:
  1372  	vmovdqa		0x80-0x100(%rcx),$xa0	# re-load smashed key
  1373  	vmovdqa		0xa0-0x100(%rcx),$xa1
  1374  	vmovdqa		0xc0-0x100(%rcx),$xa2
  1375  	vmovdqa		0xe0-0x100(%rcx),$xa3
  1376  	vmovdqa		0x100-0x100(%rcx),$xb0
  1377  	vmovdqa		0x120-0x100(%rcx),$xb1
  1378  	vmovdqa		0x140-0x100(%rcx),$xb2
  1379  	vmovdqa		0x160-0x100(%rcx),$xb3
  1380  	vmovdqa		0x180-0x200(%rax),$xt0	# "xc0"
  1381  	vmovdqa		0x1a0-0x200(%rax),$xt1	# "xc1"
  1382  	vmovdqa		0x1c0-0x200(%rax),$xt2	# "xc2"
  1383  	vmovdqa		0x1e0-0x200(%rax),$xt3	# "xc3"
  1384  	vmovdqa		0x200-0x200(%rax),$xd0
  1385  	vmovdqa		0x220-0x200(%rax),$xd1
  1386  	vmovdqa		0x240-0x200(%rax),$xd2
  1387  	vmovdqa		0x260-0x200(%rax),$xd3
  1388  	vpaddd		.Leight(%rip),$xd0,$xd0	# next SIMD counters
  1389  
  1390  .Loop_enter8x:
  1391  	vmovdqa		$xt2,0x40(%rsp)		# SIMD equivalent of "@x[10]"
  1392  	vmovdqa		$xt3,0x60(%rsp)		# SIMD equivalent of "@x[11]"
  1393  	vbroadcasti128	(%r10),$xt3
  1394  	vmovdqa		$xd0,0x200-0x200(%rax)	# save SIMD counters
  1395  	mov		\$10,%eax
  1396  	jmp		.Loop8x
  1397  
  1398  .align	32
  1399  .Loop8x:
  1400  ___
  1401  	foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; }
  1402  	foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; }
  1403  $code.=<<___;
  1404  	dec		%eax
  1405  	jnz		.Loop8x
  1406  
  1407  	lea		0x200(%rsp),%rax	# size optimization
  1408  	vpaddd		0x80-0x100(%rcx),$xa0,$xa0	# accumulate key
  1409  	vpaddd		0xa0-0x100(%rcx),$xa1,$xa1
  1410  	vpaddd		0xc0-0x100(%rcx),$xa2,$xa2
  1411  	vpaddd		0xe0-0x100(%rcx),$xa3,$xa3
  1412  
  1413  	vpunpckldq	$xa1,$xa0,$xt2		# "de-interlace" data
  1414  	vpunpckldq	$xa3,$xa2,$xt3
  1415  	vpunpckhdq	$xa1,$xa0,$xa0
  1416  	vpunpckhdq	$xa3,$xa2,$xa2
  1417  	vpunpcklqdq	$xt3,$xt2,$xa1		# "a0"
  1418  	vpunpckhqdq	$xt3,$xt2,$xt2		# "a1"
  1419  	vpunpcklqdq	$xa2,$xa0,$xa3		# "a2"
  1420  	vpunpckhqdq	$xa2,$xa0,$xa0		# "a3"
  1421  ___
  1422  	($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
  1423  $code.=<<___;
  1424  	vpaddd		0x100-0x100(%rcx),$xb0,$xb0
  1425  	vpaddd		0x120-0x100(%rcx),$xb1,$xb1
  1426  	vpaddd		0x140-0x100(%rcx),$xb2,$xb2
  1427  	vpaddd		0x160-0x100(%rcx),$xb3,$xb3
  1428  
  1429  	vpunpckldq	$xb1,$xb0,$xt2
  1430  	vpunpckldq	$xb3,$xb2,$xt3
  1431  	vpunpckhdq	$xb1,$xb0,$xb0
  1432  	vpunpckhdq	$xb3,$xb2,$xb2
  1433  	vpunpcklqdq	$xt3,$xt2,$xb1		# "b0"
  1434  	vpunpckhqdq	$xt3,$xt2,$xt2		# "b1"
  1435  	vpunpcklqdq	$xb2,$xb0,$xb3		# "b2"
  1436  	vpunpckhqdq	$xb2,$xb0,$xb0		# "b3"
  1437  ___
  1438  	($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
  1439  $code.=<<___;
  1440  	vperm2i128	\$0x20,$xb0,$xa0,$xt3	# "de-interlace" further
  1441  	vperm2i128	\$0x31,$xb0,$xa0,$xb0
  1442  	vperm2i128	\$0x20,$xb1,$xa1,$xa0
  1443  	vperm2i128	\$0x31,$xb1,$xa1,$xb1
  1444  	vperm2i128	\$0x20,$xb2,$xa2,$xa1
  1445  	vperm2i128	\$0x31,$xb2,$xa2,$xb2
  1446  	vperm2i128	\$0x20,$xb3,$xa3,$xa2
  1447  	vperm2i128	\$0x31,$xb3,$xa3,$xb3
  1448  ___
  1449  	($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
  1450  	my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
  1451  $code.=<<___;
  1452  	vmovdqa		$xa0,0x00(%rsp)		# offload $xaN
  1453  	vmovdqa		$xa1,0x20(%rsp)
  1454  	vmovdqa		0x40(%rsp),$xc2		# $xa0
  1455  	vmovdqa		0x60(%rsp),$xc3		# $xa1
  1456  
  1457  	vpaddd		0x180-0x200(%rax),$xc0,$xc0
  1458  	vpaddd		0x1a0-0x200(%rax),$xc1,$xc1
  1459  	vpaddd		0x1c0-0x200(%rax),$xc2,$xc2
  1460  	vpaddd		0x1e0-0x200(%rax),$xc3,$xc3
  1461  
  1462  	vpunpckldq	$xc1,$xc0,$xt2
  1463  	vpunpckldq	$xc3,$xc2,$xt3
  1464  	vpunpckhdq	$xc1,$xc0,$xc0
  1465  	vpunpckhdq	$xc3,$xc2,$xc2
  1466  	vpunpcklqdq	$xt3,$xt2,$xc1		# "c0"
  1467  	vpunpckhqdq	$xt3,$xt2,$xt2		# "c1"
  1468  	vpunpcklqdq	$xc2,$xc0,$xc3		# "c2"
  1469  	vpunpckhqdq	$xc2,$xc0,$xc0		# "c3"
  1470  ___
  1471  	($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
  1472  $code.=<<___;
  1473  	vpaddd		0x200-0x200(%rax),$xd0,$xd0
  1474  	vpaddd		0x220-0x200(%rax),$xd1,$xd1
  1475  	vpaddd		0x240-0x200(%rax),$xd2,$xd2
  1476  	vpaddd		0x260-0x200(%rax),$xd3,$xd3
  1477  
  1478  	vpunpckldq	$xd1,$xd0,$xt2
  1479  	vpunpckldq	$xd3,$xd2,$xt3
  1480  	vpunpckhdq	$xd1,$xd0,$xd0
  1481  	vpunpckhdq	$xd3,$xd2,$xd2
  1482  	vpunpcklqdq	$xt3,$xt2,$xd1		# "d0"
  1483  	vpunpckhqdq	$xt3,$xt2,$xt2		# "d1"
  1484  	vpunpcklqdq	$xd2,$xd0,$xd3		# "d2"
  1485  	vpunpckhqdq	$xd2,$xd0,$xd0		# "d3"
  1486  ___
  1487  	($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
  1488  $code.=<<___;
  1489  	vperm2i128	\$0x20,$xd0,$xc0,$xt3	# "de-interlace" further
  1490  	vperm2i128	\$0x31,$xd0,$xc0,$xd0
  1491  	vperm2i128	\$0x20,$xd1,$xc1,$xc0
  1492  	vperm2i128	\$0x31,$xd1,$xc1,$xd1
  1493  	vperm2i128	\$0x20,$xd2,$xc2,$xc1
  1494  	vperm2i128	\$0x31,$xd2,$xc2,$xd2
  1495  	vperm2i128	\$0x20,$xd3,$xc3,$xc2
  1496  	vperm2i128	\$0x31,$xd3,$xc3,$xd3
  1497  ___
  1498  	($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
  1499  	($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)=
  1500  	($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3);
  1501  	($xa0,$xa1)=($xt2,$xt3);
  1502  $code.=<<___;
  1503  	vmovdqa		0x00(%rsp),$xa0		# $xaN was offloaded, remember?
  1504  	vmovdqa		0x20(%rsp),$xa1
  1505  
  1506  	cmp		\$64*8,$len
  1507  	jb		.Ltail8x
  1508  
  1509  	vpxor		0x00($inp),$xa0,$xa0	# xor with input
  1510  	vpxor		0x20($inp),$xb0,$xb0
  1511  	vpxor		0x40($inp),$xc0,$xc0
  1512  	vpxor		0x60($inp),$xd0,$xd0
  1513  	lea		0x80($inp),$inp		# size optimization
  1514  	vmovdqu		$xa0,0x00($out)
  1515  	vmovdqu		$xb0,0x20($out)
  1516  	vmovdqu		$xc0,0x40($out)
  1517  	vmovdqu		$xd0,0x60($out)
  1518  	lea		0x80($out),$out		# size optimization
  1519  
  1520  	vpxor		0x00($inp),$xa1,$xa1
  1521  	vpxor		0x20($inp),$xb1,$xb1
  1522  	vpxor		0x40($inp),$xc1,$xc1
  1523  	vpxor		0x60($inp),$xd1,$xd1
  1524  	lea		0x80($inp),$inp		# size optimization
  1525  	vmovdqu		$xa1,0x00($out)
  1526  	vmovdqu		$xb1,0x20($out)
  1527  	vmovdqu		$xc1,0x40($out)
  1528  	vmovdqu		$xd1,0x60($out)
  1529  	lea		0x80($out),$out		# size optimization
  1530  
  1531  	vpxor		0x00($inp),$xa2,$xa2
  1532  	vpxor		0x20($inp),$xb2,$xb2
  1533  	vpxor		0x40($inp),$xc2,$xc2
  1534  	vpxor		0x60($inp),$xd2,$xd2
  1535  	lea		0x80($inp),$inp		# size optimization
  1536  	vmovdqu		$xa2,0x00($out)
  1537  	vmovdqu		$xb2,0x20($out)
  1538  	vmovdqu		$xc2,0x40($out)
  1539  	vmovdqu		$xd2,0x60($out)
  1540  	lea		0x80($out),$out		# size optimization
  1541  
  1542  	vpxor		0x00($inp),$xa3,$xa3
  1543  	vpxor		0x20($inp),$xb3,$xb3
  1544  	vpxor		0x40($inp),$xc3,$xc3
  1545  	vpxor		0x60($inp),$xd3,$xd3
  1546  	lea		0x80($inp),$inp		# size optimization
  1547  	vmovdqu		$xa3,0x00($out)
  1548  	vmovdqu		$xb3,0x20($out)
  1549  	vmovdqu		$xc3,0x40($out)
  1550  	vmovdqu		$xd3,0x60($out)
  1551  	lea		0x80($out),$out		# size optimization
  1552  
  1553  	sub		\$64*8,$len
  1554  	jnz		.Loop_outer8x
  1555  
  1556  	jmp		.Ldone8x
  1557  
  1558  .Ltail8x:
  1559  	cmp		\$448,$len
  1560  	jae		.L448_or_more8x
  1561  	cmp		\$384,$len
  1562  	jae		.L384_or_more8x
  1563  	cmp		\$320,$len
  1564  	jae		.L320_or_more8x
  1565  	cmp		\$256,$len
  1566  	jae		.L256_or_more8x
  1567  	cmp		\$192,$len
  1568  	jae		.L192_or_more8x
  1569  	cmp		\$128,$len
  1570  	jae		.L128_or_more8x
  1571  	cmp		\$64,$len
  1572  	jae		.L64_or_more8x
  1573  
  1574  	xor		%r10,%r10
  1575  	vmovdqa		$xa0,0x00(%rsp)
  1576  	vmovdqa		$xb0,0x20(%rsp)
  1577  	jmp		.Loop_tail8x
  1578  
  1579  .align	32
  1580  .L64_or_more8x:
  1581  	vpxor		0x00($inp),$xa0,$xa0	# xor with input
  1582  	vpxor		0x20($inp),$xb0,$xb0
  1583  	vmovdqu		$xa0,0x00($out)
  1584  	vmovdqu		$xb0,0x20($out)
  1585  	je		.Ldone8x
  1586  
  1587  	lea		0x40($inp),$inp		# inp+=64*1
  1588  	xor		%r10,%r10
  1589  	vmovdqa		$xc0,0x00(%rsp)
  1590  	lea		0x40($out),$out		# out+=64*1
  1591  	sub		\$64,$len		# len-=64*1
  1592  	vmovdqa		$xd0,0x20(%rsp)
  1593  	jmp		.Loop_tail8x
  1594  
  1595  .align	32
  1596  .L128_or_more8x:
  1597  	vpxor		0x00($inp),$xa0,$xa0	# xor with input
  1598  	vpxor		0x20($inp),$xb0,$xb0
  1599  	vpxor		0x40($inp),$xc0,$xc0
  1600  	vpxor		0x60($inp),$xd0,$xd0
  1601  	vmovdqu		$xa0,0x00($out)
  1602  	vmovdqu		$xb0,0x20($out)
  1603  	vmovdqu		$xc0,0x40($out)
  1604  	vmovdqu		$xd0,0x60($out)
  1605  	je		.Ldone8x
  1606  
  1607  	lea		0x80($inp),$inp		# inp+=64*2
  1608  	xor		%r10,%r10
  1609  	vmovdqa		$xa1,0x00(%rsp)
  1610  	lea		0x80($out),$out		# out+=64*2
  1611  	sub		\$128,$len		# len-=64*2
  1612  	vmovdqa		$xb1,0x20(%rsp)
  1613  	jmp		.Loop_tail8x
  1614  
  1615  .align	32
  1616  .L192_or_more8x:
  1617  	vpxor		0x00($inp),$xa0,$xa0	# xor with input
  1618  	vpxor		0x20($inp),$xb0,$xb0
  1619  	vpxor		0x40($inp),$xc0,$xc0
  1620  	vpxor		0x60($inp),$xd0,$xd0
  1621  	vpxor		0x80($inp),$xa1,$xa1
  1622  	vpxor		0xa0($inp),$xb1,$xb1
  1623  	vmovdqu		$xa0,0x00($out)
  1624  	vmovdqu		$xb0,0x20($out)
  1625  	vmovdqu		$xc0,0x40($out)
  1626  	vmovdqu		$xd0,0x60($out)
  1627  	vmovdqu		$xa1,0x80($out)
  1628  	vmovdqu		$xb1,0xa0($out)
  1629  	je		.Ldone8x
  1630  
  1631  	lea		0xc0($inp),$inp		# inp+=64*3
  1632  	xor		%r10,%r10
  1633  	vmovdqa		$xc1,0x00(%rsp)
  1634  	lea		0xc0($out),$out		# out+=64*3
  1635  	sub		\$192,$len		# len-=64*3
  1636  	vmovdqa		$xd1,0x20(%rsp)
  1637  	jmp		.Loop_tail8x
  1638  
  1639  .align	32
  1640  .L256_or_more8x:
  1641  	vpxor		0x00($inp),$xa0,$xa0	# xor with input
  1642  	vpxor		0x20($inp),$xb0,$xb0
  1643  	vpxor		0x40($inp),$xc0,$xc0
  1644  	vpxor		0x60($inp),$xd0,$xd0
  1645  	vpxor		0x80($inp),$xa1,$xa1
  1646  	vpxor		0xa0($inp),$xb1,$xb1
  1647  	vpxor		0xc0($inp),$xc1,$xc1
  1648  	vpxor		0xe0($inp),$xd1,$xd1
  1649  	vmovdqu		$xa0,0x00($out)
  1650  	vmovdqu		$xb0,0x20($out)
  1651  	vmovdqu		$xc0,0x40($out)
  1652  	vmovdqu		$xd0,0x60($out)
  1653  	vmovdqu		$xa1,0x80($out)
  1654  	vmovdqu		$xb1,0xa0($out)
  1655  	vmovdqu		$xc1,0xc0($out)
  1656  	vmovdqu		$xd1,0xe0($out)
  1657  	je		.Ldone8x
  1658  
  1659  	lea		0x100($inp),$inp	# inp+=64*4
  1660  	xor		%r10,%r10
  1661  	vmovdqa		$xa2,0x00(%rsp)
  1662  	lea		0x100($out),$out	# out+=64*4
  1663  	sub		\$256,$len		# len-=64*4
  1664  	vmovdqa		$xb2,0x20(%rsp)
  1665  	jmp		.Loop_tail8x
  1666  
  1667  .align	32
  1668  .L320_or_more8x:
  1669  	vpxor		0x00($inp),$xa0,$xa0	# xor with input
  1670  	vpxor		0x20($inp),$xb0,$xb0
  1671  	vpxor		0x40($inp),$xc0,$xc0
  1672  	vpxor		0x60($inp),$xd0,$xd0
  1673  	vpxor		0x80($inp),$xa1,$xa1
  1674  	vpxor		0xa0($inp),$xb1,$xb1
  1675  	vpxor		0xc0($inp),$xc1,$xc1
  1676  	vpxor		0xe0($inp),$xd1,$xd1
  1677  	vpxor		0x100($inp),$xa2,$xa2
  1678  	vpxor		0x120($inp),$xb2,$xb2
  1679  	vmovdqu		$xa0,0x00($out)
  1680  	vmovdqu		$xb0,0x20($out)
  1681  	vmovdqu		$xc0,0x40($out)
  1682  	vmovdqu		$xd0,0x60($out)
  1683  	vmovdqu		$xa1,0x80($out)
  1684  	vmovdqu		$xb1,0xa0($out)
  1685  	vmovdqu		$xc1,0xc0($out)
  1686  	vmovdqu		$xd1,0xe0($out)
  1687  	vmovdqu		$xa2,0x100($out)
  1688  	vmovdqu		$xb2,0x120($out)
  1689  	je		.Ldone8x
  1690  
  1691  	lea		0x140($inp),$inp	# inp+=64*5
  1692  	xor		%r10,%r10
  1693  	vmovdqa		$xc2,0x00(%rsp)
  1694  	lea		0x140($out),$out	# out+=64*5
  1695  	sub		\$320,$len		# len-=64*5
  1696  	vmovdqa		$xd2,0x20(%rsp)
  1697  	jmp		.Loop_tail8x
  1698  
  1699  .align	32
  1700  .L384_or_more8x:
  1701  	vpxor		0x00($inp),$xa0,$xa0	# xor with input
  1702  	vpxor		0x20($inp),$xb0,$xb0
  1703  	vpxor		0x40($inp),$xc0,$xc0
  1704  	vpxor		0x60($inp),$xd0,$xd0
  1705  	vpxor		0x80($inp),$xa1,$xa1
  1706  	vpxor		0xa0($inp),$xb1,$xb1
  1707  	vpxor		0xc0($inp),$xc1,$xc1
  1708  	vpxor		0xe0($inp),$xd1,$xd1
  1709  	vpxor		0x100($inp),$xa2,$xa2
  1710  	vpxor		0x120($inp),$xb2,$xb2
  1711  	vpxor		0x140($inp),$xc2,$xc2
  1712  	vpxor		0x160($inp),$xd2,$xd2
  1713  	vmovdqu		$xa0,0x00($out)
  1714  	vmovdqu		$xb0,0x20($out)
  1715  	vmovdqu		$xc0,0x40($out)
  1716  	vmovdqu		$xd0,0x60($out)
  1717  	vmovdqu		$xa1,0x80($out)
  1718  	vmovdqu		$xb1,0xa0($out)
  1719  	vmovdqu		$xc1,0xc0($out)
  1720  	vmovdqu		$xd1,0xe0($out)
  1721  	vmovdqu		$xa2,0x100($out)
  1722  	vmovdqu		$xb2,0x120($out)
  1723  	vmovdqu		$xc2,0x140($out)
  1724  	vmovdqu		$xd2,0x160($out)
  1725  	je		.Ldone8x
  1726  
  1727  	lea		0x180($inp),$inp	# inp+=64*6
  1728  	xor		%r10,%r10
  1729  	vmovdqa		$xa3,0x00(%rsp)
  1730  	lea		0x180($out),$out	# out+=64*6
  1731  	sub		\$384,$len		# len-=64*6
  1732  	vmovdqa		$xb3,0x20(%rsp)
  1733  	jmp		.Loop_tail8x
  1734  
  1735  .align	32
  1736  .L448_or_more8x:
  1737  	vpxor		0x00($inp),$xa0,$xa0	# xor with input
  1738  	vpxor		0x20($inp),$xb0,$xb0
  1739  	vpxor		0x40($inp),$xc0,$xc0
  1740  	vpxor		0x60($inp),$xd0,$xd0
  1741  	vpxor		0x80($inp),$xa1,$xa1
  1742  	vpxor		0xa0($inp),$xb1,$xb1
  1743  	vpxor		0xc0($inp),$xc1,$xc1
  1744  	vpxor		0xe0($inp),$xd1,$xd1
  1745  	vpxor		0x100($inp),$xa2,$xa2
  1746  	vpxor		0x120($inp),$xb2,$xb2
  1747  	vpxor		0x140($inp),$xc2,$xc2
  1748  	vpxor		0x160($inp),$xd2,$xd2
  1749  	vpxor		0x180($inp),$xa3,$xa3
  1750  	vpxor		0x1a0($inp),$xb3,$xb3
  1751  	vmovdqu		$xa0,0x00($out)
  1752  	vmovdqu		$xb0,0x20($out)
  1753  	vmovdqu		$xc0,0x40($out)
  1754  	vmovdqu		$xd0,0x60($out)
  1755  	vmovdqu		$xa1,0x80($out)
  1756  	vmovdqu		$xb1,0xa0($out)
  1757  	vmovdqu		$xc1,0xc0($out)
  1758  	vmovdqu		$xd1,0xe0($out)
  1759  	vmovdqu		$xa2,0x100($out)
  1760  	vmovdqu		$xb2,0x120($out)
  1761  	vmovdqu		$xc2,0x140($out)
  1762  	vmovdqu		$xd2,0x160($out)
  1763  	vmovdqu		$xa3,0x180($out)
  1764  	vmovdqu		$xb3,0x1a0($out)
  1765  	je		.Ldone8x
  1766  
  1767  	lea		0x1c0($inp),$inp	# inp+=64*7
  1768  	xor		%r10,%r10
  1769  	vmovdqa		$xc3,0x00(%rsp)
  1770  	lea		0x1c0($out),$out	# out+=64*7
  1771  	sub		\$448,$len		# len-=64*7
  1772  	vmovdqa		$xd3,0x20(%rsp)
  1773  
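        	# Tail loop: the remaining partial 64-byte block of keystream was
        	# spilled to 0x00(%rsp)/0x20(%rsp) above.  XOR it into the input one
        	# byte at a time; %r10 indexes the byte and the length counts down.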
  1774  .Loop_tail8x:
  1775  	movzb		($inp,%r10),%eax
  1776  	movzb		(%rsp,%r10),%ecx
  1777  	lea		1(%r10),%r10
  1778  	xor		%ecx,%eax
  1779  	mov		%al,-1($out,%r10)
  1780  	dec		$len
  1781  	jnz		.Loop_tail8x
  1782  
  1783  .Ldone8x:
  1784  	vzeroall
  1785  ___
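# On Win64 xmm6-xmm15 are non-volatile, so the 8x prologue saved them below
# the frame pointer kept in %r9; restore them before the stack pointer is
# put back.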
  1786  $code.=<<___	if ($win64);
  1787  	movaps		-0xa8(%r9),%xmm6
  1788  	movaps		-0x98(%r9),%xmm7
  1789  	movaps		-0x88(%r9),%xmm8
  1790  	movaps		-0x78(%r9),%xmm9
  1791  	movaps		-0x68(%r9),%xmm10
  1792  	movaps		-0x58(%r9),%xmm11
  1793  	movaps		-0x48(%r9),%xmm12
  1794  	movaps		-0x38(%r9),%xmm13
  1795  	movaps		-0x28(%r9),%xmm14
  1796  	movaps		-0x18(%r9),%xmm15
  1797  ___
  1798  $code.=<<___;
  1799  	lea		(%r9),%rsp
  1800  .cfi_def_cfa_register	rsp
  1801  .L8x_epilogue:
  1802  	ret
  1803  .cfi_endproc
  1804  .size	ChaCha20_8x,.-ChaCha20_8x
  1805  ___
  1806  }
  1807  
  1808  ########################################################################
  1809  # AVX512 code paths were removed
  1810  
  1811  # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
  1812  #		CONTEXT *context,DISPATCHER_CONTEXT *disp)
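# Each handler below follows the same pattern: if context->Rip lies inside
# the function body (between the prologue and epilogue labels recorded in
# HandlerData), the registers the function saved on its stack are copied
# back into the CONTEXT record; the common tail then hands a copy of the
# CONTEXT to the dispatcher and calls RtlVirtualUnwind to continue the
# unwind past this frame.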
  1813  if ($win64) {
  1814  $rec="%rcx";
  1815  $frame="%rdx";
  1816  $context="%r8";
  1817  $disp="%r9";
  1818  
  1819  $code.=<<___;
  1820  .extern	__imp_RtlVirtualUnwind
  1821  .type	se_handler,\@abi-omnipotent
  1822  .align	16
  1823  se_handler:
  1824  	push	%rsi
  1825  	push	%rdi
  1826  	push	%rbx
  1827  	push	%rbp
  1828  	push	%r12
  1829  	push	%r13
  1830  	push	%r14
  1831  	push	%r15
  1832  	pushfq
  1833  	sub	\$64,%rsp
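        	# 64 bytes: 32-byte home space plus the four stack arguments
        	# passed to RtlVirtualUnwind below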
  1834  
  1835  	mov	120($context),%rax	# pull context->Rax
  1836  	mov	248($context),%rbx	# pull context->Rip
  1837  
  1838  	mov	8($disp),%rsi		# disp->ImageBase
  1839  	mov	56($disp),%r11		# disp->HandlerData
  1840  
  1841  	lea	.Lctr32_body(%rip),%r10
  1842  	cmp	%r10,%rbx		# context->Rip<.Lctr32_body
  1843  	jb	.Lcommon_seh_tail
  1844  
  1845  	mov	152($context),%rax	# pull context->Rsp
  1846  
  1847  	lea	.Lno_data(%rip),%r10	# epilogue label
  1848  	cmp	%r10,%rbx		# context->Rip>=.Lno_data
  1849  	jae	.Lcommon_seh_tail
  1850  
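        	# the ctr32 prologue pushes six registers and then reserves
        	# 64+24 bytes of local storage; step %rax past both and read
        	# the callee-saved registers back out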
  1851  	lea	64+24+48(%rax),%rax
  1852  
  1853  	mov	-8(%rax),%rbx
  1854  	mov	-16(%rax),%rbp
  1855  	mov	-24(%rax),%r12
  1856  	mov	-32(%rax),%r13
  1857  	mov	-40(%rax),%r14
  1858  	mov	-48(%rax),%r15
  1859  	mov	%rbx,144($context)	# restore context->Rbx
  1860  	mov	%rbp,160($context)	# restore context->Rbp
  1861  	mov	%r12,216($context)	# restore context->R12
  1862  	mov	%r13,224($context)	# restore context->R13
  1863  	mov	%r14,232($context)	# restore context->R14
  1864  	mov	%r15,240($context)	# restore context->R15
  1865  
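        	# common tail: restore Rsp/Rsi/Rdi in the CONTEXT, copy it to the
        	# dispatcher's ContextRecord and call RtlVirtualUnwind to continue
        	# the unwind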
  1866  .Lcommon_seh_tail:
  1867  	mov	8(%rax),%rdi
  1868  	mov	16(%rax),%rsi
  1869  	mov	%rax,152($context)	# restore context->Rsp
  1870  	mov	%rsi,168($context)	# restore context->Rsi
  1871  	mov	%rdi,176($context)	# restore context->Rdi
  1872  
  1873  	mov	40($disp),%rdi		# disp->ContextRecord
  1874  	mov	$context,%rsi		# context
  1875  	mov	\$154,%ecx		# sizeof(CONTEXT) in qwords
  1876  	.long	0xa548f3fc		# cld; rep movsq
  1877  
  1878  	mov	$disp,%rsi
  1879  	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
  1880  	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
  1881  	mov	0(%rsi),%r8		# arg3, disp->ControlPc
  1882  	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
  1883  	mov	40(%rsi),%r10		# disp->ContextRecord
  1884  	lea	56(%rsi),%r11		# &disp->HandlerData
  1885  	lea	24(%rsi),%r12		# &disp->EstablisherFrame
  1886  	mov	%r10,32(%rsp)		# arg5
  1887  	mov	%r11,40(%rsp)		# arg6
  1888  	mov	%r12,48(%rsp)		# arg7
  1889  	mov	%rcx,56(%rsp)		# arg8, (NULL)
  1890  	call	*__imp_RtlVirtualUnwind(%rip)
  1891  
  1892  	mov	\$1,%eax		# ExceptionContinueSearch
  1893  	add	\$64,%rsp
  1894  	popfq
  1895  	pop	%r15
  1896  	pop	%r14
  1897  	pop	%r13
  1898  	pop	%r12
  1899  	pop	%rbp
  1900  	pop	%rbx
  1901  	pop	%rdi
  1902  	pop	%rsi
  1903  	ret
  1904  .size	se_handler,.-se_handler
  1905  
  1906  .type	ssse3_handler,\@abi-omnipotent
  1907  .align	16
  1908  ssse3_handler:
  1909  	push	%rsi
  1910  	push	%rdi
  1911  	push	%rbx
  1912  	push	%rbp
  1913  	push	%r12
  1914  	push	%r13
  1915  	push	%r14
  1916  	push	%r15
  1917  	pushfq
  1918  	sub	\$64,%rsp
  1919  
  1920  	mov	120($context),%rax	# pull context->Rax
  1921  	mov	248($context),%rbx	# pull context->Rip
  1922  
  1923  	mov	8($disp),%rsi		# disp->ImageBase
  1924  	mov	56($disp),%r11		# disp->HandlerData
  1925  
  1926  	mov	0(%r11),%r10d		# HandlerData[0]
  1927  	lea	(%rsi,%r10),%r10	# prologue label
  1928  	cmp	%r10,%rbx		# context->Rip<prologue label
  1929  	jb	.Lcommon_seh_tail
  1930  
  1931  	mov	192($context),%rax	# pull context->R9
  1932  
  1933  	mov	4(%r11),%r10d		# HandlerData[1]
  1934  	lea	(%rsi,%r10),%r10	# epilogue label
  1935  	cmp	%r10,%rbx		# context->Rip>=epilogue label
  1936  	jae	.Lcommon_seh_tail
  1937  
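        	# the SSSE3 path saves xmm6 and xmm7 at -0x28 off the frame
        	# pointer held in R9; copy those 4 qwords back into context->Xmm6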
  1938  	lea	-0x28(%rax),%rsi
  1939  	lea	512($context),%rdi	# &context.Xmm6
  1940  	mov	\$4,%ecx
  1941  	.long	0xa548f3fc		# cld; rep movsq
  1942  
  1943  	jmp	.Lcommon_seh_tail
  1944  .size	ssse3_handler,.-ssse3_handler
  1945  
  1946  .type	full_handler,\@abi-omnipotent
  1947  .align	16
  1948  full_handler:
  1949  	push	%rsi
  1950  	push	%rdi
  1951  	push	%rbx
  1952  	push	%rbp
  1953  	push	%r12
  1954  	push	%r13
  1955  	push	%r14
  1956  	push	%r15
  1957  	pushfq
  1958  	sub	\$64,%rsp
  1959  
  1960  	mov	120($context),%rax	# pull context->Rax
  1961  	mov	248($context),%rbx	# pull context->Rip
  1962  
  1963  	mov	8($disp),%rsi		# disp->ImageBase
  1964  	mov	56($disp),%r11		# disp->HandlerData
  1965  
  1966  	mov	0(%r11),%r10d		# HandlerData[0]
  1967  	lea	(%rsi,%r10),%r10	# prologue label
  1968  	cmp	%r10,%rbx		# context->Rip<prologue label
  1969  	jb	.Lcommon_seh_tail
  1970  
  1971  	mov	192($context),%rax	# pull context->R9
  1972  
  1973  	mov	4(%r11),%r10d		# HandlerData[1]
  1974  	lea	(%rsi,%r10),%r10	# epilogue label
  1975  	cmp	%r10,%rbx		# context->Rip>=epilogue label
  1976  	jae	.Lcommon_seh_tail
  1977  
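        	# the 4x/8x paths save all ten non-volatile xmm registers
        	# (xmm6-xmm15, 20 qwords) at -0xa8 off the frame pointer held
        	# in R9; copy them back into context->Xmm6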
  1978  	lea	-0xa8(%rax),%rsi
  1979  	lea	512($context),%rdi	# &context.Xmm6
  1980  	mov	\$20,%ecx
  1981  	.long	0xa548f3fc		# cld; rep movsq
  1982  
  1983  	jmp	.Lcommon_seh_tail
  1984  .size	full_handler,.-full_handler
  1985  
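        # .pdata associates each function's code range with its unwind
        # information; the .xdata entries below name the handler and, where
        # needed, the prologue/epilogue labels passed as HandlerData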
  1986  .section	.pdata
  1987  .align	4
  1988  	.rva	.LSEH_begin_ChaCha20_ctr32
  1989  	.rva	.LSEH_end_ChaCha20_ctr32
  1990  	.rva	.LSEH_info_ChaCha20_ctr32
  1991  
  1992  	.rva	.LSEH_begin_ChaCha20_ssse3
  1993  	.rva	.LSEH_end_ChaCha20_ssse3
  1994  	.rva	.LSEH_info_ChaCha20_ssse3
  1995  
  1996  	.rva	.LSEH_begin_ChaCha20_4x
  1997  	.rva	.LSEH_end_ChaCha20_4x
  1998  	.rva	.LSEH_info_ChaCha20_4x
  1999  ___
  2000  $code.=<<___ if ($avx>1);
  2001  	.rva	.LSEH_begin_ChaCha20_8x
  2002  	.rva	.LSEH_end_ChaCha20_8x
  2003  	.rva	.LSEH_info_ChaCha20_8x
  2004  ___
  2005  $code.=<<___;
  2006  .section	.xdata
  2007  .align	8
  2008  .LSEH_info_ChaCha20_ctr32:
  2009  	.byte	9,0,0,0
  2010  	.rva	se_handler
  2011  
  2012  .LSEH_info_ChaCha20_ssse3:
  2013  	.byte	9,0,0,0
  2014  	.rva	ssse3_handler
  2015  	.rva	.Lssse3_body,.Lssse3_epilogue
  2016  
  2017  .LSEH_info_ChaCha20_4x:
  2018  	.byte	9,0,0,0
  2019  	.rva	full_handler
  2020  	.rva	.L4x_body,.L4x_epilogue
  2021  ___
  2022  $code.=<<___ if ($avx>1);
  2023  .LSEH_info_ChaCha20_8x:
  2024  	.byte	9,0,0,0
  2025  	.rva	full_handler
  2026  	.rva	.L8x_body,.L8x_epilogue			# HandlerData[]
  2027  ___
  2028  }
  2029  
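# Post-process the generated code: evaluate constant expressions written
# between backticks, "down-shift" register tokens of the form %x#%ymmN
# (or %x#%zmmN) to their %xmmN aliases, and feed each line to the
# x86_64-xlate.pl pipe opened above.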
  2030  foreach (split("\n",$code)) {
  2031  	s/\`([^\`]*)\`/eval $1/ge;
  2032  
  2033  	s/%x#%[yz]/%x/g;	# "down-shift"
  2034  
  2035  	print $_,"\n";
  2036  }
  2037  
  2038  close STDOUT or die "error closing STDOUT: $!";