git.sr.ht/~pingoo/stdx@v0.0.0-20240218134121-094174641f6e/crypto/asm/chacha_x86.pl (about)

     1  #! /usr/bin/env perl
     2  # Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
     3  #
     4  # Licensed under the OpenSSL license (the "License").  You may not use
     5  # this file except in compliance with the License.  You can obtain a copy
     6  # in the file LICENSE in the source distribution or at
     7  # https://www.openssl.org/source/license.html
     8  
     9  #
    10  # ====================================================================
    11  # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
    12  # project. The module is, however, dual licensed under OpenSSL and
    13  # CRYPTOGAMS licenses depending on where you obtain it. For further
    14  # details see http://www.openssl.org/~appro/cryptogams/.
    15  # ====================================================================
    16  #
    17  # January 2015
    18  #
    19  # ChaCha20 for x86.
    20  #
    21  # Performance in cycles per byte out of large buffer.
    22  #
    23  #		1xIALU/gcc	4xSSSE3
    24  # Pentium	17.5/+80%
    25  # PIII		14.2/+60%
    26  # P4		18.6/+84%
    27  # Core2		9.56/+89%	4.83
    28  # Westmere	9.50/+45%	3.35
    29  # Sandy Bridge	10.5/+47%	3.20
    30  # Haswell	8.15/+50%	2.83
    31  # Skylake	7.53/+22%	2.75
    32  # Silvermont	17.4/+36%	8.35
    33  # Goldmont	13.4/+40%	4.36
    34  # Sledgehammer	10.2/+54%
    35  # Bulldozer	13.4/+50%	4.38(*)
    36  #
    37  # (*)	In upstream OpenSSL, Bulldozer actually executes a 4xXOP code path that delivers 3.55.
    38  #
    39  # Modified from upstream OpenSSL to remove the XOP code.
    40  
    41  $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
        # $dir is the directory part of this script's path (captured by the
        # match above); search it and ../../perlasm relative to it for the
        # perlasm support module.
    42  push(@INC,"${dir}","${dir}../../perlasm");
    43  require "x86asm.pl";
    44  
        # The last command-line argument names the output file.  The perlasm
        # helpers print to STDOUT, so STDOUT is redirected to that file.  The
        # three-argument form of open is used so the file name is never parsed
        # for mode characters, and failure is fatal instead of silently
        # leaving the generated assembly somewhere else.  The explicit
        # defined() test is needed because three-argument open with an undef
        # name would succeed by opening an anonymous temporary file.
    45  $output=pop;
        defined($output) or die "$0: no output file specified\n";
    46  open(STDOUT,">",$output) or die "can't open $output: $!";
    47  
        # First argument is handed to asm_init unchanged; the second argument
        # is true when the last remaining command-line argument is "386".
    48  &asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");
    49  
        # $xmm gates the SSSE3 code further down; $gasver gates the 4x SSSE3
        # path.  Both are forced on here.
    50  $xmm=$ymm=1;
    51  $gasver=999;  # enable everything
    52  
        # Register names used by the scalar code.  b, c and d each have a
        # second register (the "_" variant); QUARTERROUND swaps the two names
        # of each pair at the end of every call.
    53  $a="eax";
    54  ($b,$b_)=("ebx","ebp");
    55  ($c,$c_)=("ecx","esi");
    56  ($d,$d_)=("edx","edi");
    57  
    58  sub QUARTERROUND {
        # Emit scalar (integer-unit) code for one ChaCha quarter-round.
        #   $ai,$bi,$ci,$di  indices (0..15) of the state words acting as
        #                    a, b, c, d; word n lives at 4*n(esp)
        #   $i               position (0..7) of this call within the
        #                    eight-call sequence of the main loop
        # The arithmetic of this quarter-round is interleaved with stores of
        # the previous call's results and loads of the next call's operands,
        # so the order of the statements below is significant.  The file-level
        # names $b/$b_, $c/$c_, $d/$d_ are swapped on exit, so consecutive
        # calls alternate between the two registers of each pair.
        #
        # "next"/"previous" indices: keep the group of four (index & ~3) and
        # step the low two bits by +1 / -1 modulo 4.
    59  my ($ai,$bi,$ci,$di,$i)=@_;
    60  my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di));	# next
    61  my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di));	# previous
    62  
    63  	#       a   b   c   d
    64  	#
    65  	#       0   4   8  12 < even round
    66  	#       1   5   9  13
    67  	#       2   6  10  14
    68  	#       3   7  11  15
    69  	#       0   5  10  15 < odd round
    70  	#       1   6  11  12
    71  	#       2   7   8  13
    72  	#       3   4   9  14
    73  
        	# At the first and last call of each round the neighbouring call
        	# belongs to the other index pattern, so the plain +/-1 mapping
        	# is corrected.  Working the arithmetic through gives:
        	#   $i==0: previous becomes (3,4,9,14)
        	#   $i==3: next     becomes (0,5,10,15)
        	#   $i==4: previous becomes (3,7,11,15)
        	#   $i==7: next     becomes (0,4,8,12)
    74  	if ($i==0) {
    75              my $j=4;
    76  	    ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp));
    77  	} elsif ($i==3) {
    78              my $j=0;
    79  	    ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn));
    80  	} elsif ($i==4) {
    81              my $j=4;
    82  	    ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp));
    83  	} elsif ($i==7) {
    84              my $j=0;
    85  	    ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn));
    86  	}
    87  
        	# Arithmetic emitted below, in order (the leading a+=b was already
        	# emitted by the previous call, or at the loop top for $i==0):
        	#   d^=a; d<<<=16; c+=d; b^=c; b<<<=12;
        	#   a+=b; d^=a; d<<<=8;  c+=d; b^=c; b<<<=7
        	# Statements indented by one extra space are the interleaved
        	# loads/stores and the next call's a+=b:
        	#   - the previous call's b (now named $b_) is written back
        	#     unless $i==0;
        	#   - the previous call's c (now named $c_) is written back and
        	#     the next call's c fetched only when $ai is 1 or 2;
        	#   - the next call's d is fetched into $d_ when it is a different
        	#     word ($di!=$dn); otherwise the result is copied to $d_;
        	#   - for $i==7, $b_ receives the loop counter from 128(esp)
        	#     instead of the next b, and the next call's a+=b is skipped.
    88  	#&add	($a,$b);			# see elsewhere
    89  	&xor	($d,$a);
    90  	 &mov	(&DWP(4*$cp,"esp"),$c_)		if ($ai>0 && $ai<3);
    91  	&rol	($d,16);
    92  	 &mov	(&DWP(4*$bp,"esp"),$b_)		if ($i!=0);
    93  	&add	($c,$d);
    94  	 &mov	($c_,&DWP(4*$cn,"esp"))		if ($ai>0 && $ai<3);
    95  	&xor	($b,$c);
    96  	 &mov	($d_,&DWP(4*$dn,"esp"))		if ($di!=$dn);
    97  	&rol	($b,12);
    98  	 &mov	($b_,&DWP(4*$bn,"esp"))		if ($i<7);
    99  	 &mov	($b_,&DWP(128,"esp"))		if ($i==7);	# loop counter
   100  	&add	($a,$b);
   101  	&xor	($d,$a);
   102  	&mov	(&DWP(4*$ai,"esp"),$a);
   103  	&rol	($d,8);
   104  	&mov	($a,&DWP(4*$an,"esp"));
   105  	&add	($c,$d);
   106  	&mov	(&DWP(4*$di,"esp"),$d)		if ($di!=$dn);
   107  	&mov	($d_,$d)			if ($di==$dn);
   108  	&xor	($b,$c);
   109  	 &add	($a,$b_)			if ($i<7);	# elsewhere
   110  	&rol	($b,7);
   111  
        	# Exchange the names of each register pair for the next call.
   112  	($b,$b_)=($b_,$b);
   113  	($c,$c_)=($c_,$c);
   114  	($d,$d_)=($d_,$d);
   115  }
   116  
   117  &static_label("ssse3_shortcut");
   118  &static_label("ssse3_data");
   119  &static_label("pic_point");
   120  
        # ChaCha20_ctr32 -- scalar code path plus run-time dispatch to SSSE3.
        # Arguments, as they are used below:
        #   wparam(0) output pointer     wparam(1) input pointer
        #   wparam(2) length in bytes    wparam(3) key (eight dwords read)
        #   wparam(4) counter and nonce (four dwords, counter first)
        # Nothing is done when the length is zero.
   121  &function_begin("ChaCha20_ctr32");
   122  	&xor	("eax","eax");
   123  	&cmp	("eax",&wparam(2));		# len==0?
   124  	&je	(&label("no_data"));
   125  if ($xmm) {
        	# call/pop yields the address of pic_point in eax, which is then
        	# used to locate OPENSSL_ia32cap_P.  The SSSE3 path is taken only
        	# when both capability bits tested below are set; that path later
        	# addresses its constant table relative to pic_point through eax.
   126  	&call	(&label("pic_point"));
   127  &set_label("pic_point");
   128  	&blindpop("eax");
   129  	&picmeup("ebp","OPENSSL_ia32cap_P","eax",&label("pic_point"));
   130  	&test	(&DWP(0,"ebp"),1<<24);		# test FXSR bit
   131  	&jz	(&label("x86"));
   132  	&test	(&DWP(4,"ebp"),1<<9);		# test SSSE3 bit
   133  	&jz	(&label("x86"));
   134  	&jmp	(&label("ssse3_shortcut"));
   135  &set_label("x86");
   136  }
   137  	&mov	("esi",&wparam(3));		# key
   138  	&mov	("edi",&wparam(4));		# counter and nonce
   139  
        	# Frame layout as used below (stack_push(33) presumably reserves
        	# 33 dwords, which matches the 132 bytes touched):
        	#     0.. 63(esp)  working state, word n at 4*n(esp)
        	#    64..127(esp)  input block, word n at 64+4*n(esp); only
        	#                  words 4..15 are stored, sigma is immediate
        	#   128(esp)       round-loop counter
   140  	&stack_push(33);
   141  
   142  	&mov	("eax",&DWP(4*0,"esi"));	# copy key
   143  	&mov	("ebx",&DWP(4*1,"esi"));
   144  	&mov	("ecx",&DWP(4*2,"esi"));
   145  	&mov	("edx",&DWP(4*3,"esi"));
   146  	&mov	(&DWP(64+4*4,"esp"),"eax");
   147  	&mov	(&DWP(64+4*5,"esp"),"ebx");
   148  	&mov	(&DWP(64+4*6,"esp"),"ecx");
   149  	&mov	(&DWP(64+4*7,"esp"),"edx");
   150  	&mov	("eax",&DWP(4*4,"esi"));
   151  	&mov	("ebx",&DWP(4*5,"esi"));
   152  	&mov	("ecx",&DWP(4*6,"esi"));
   153  	&mov	("edx",&DWP(4*7,"esi"));
   154  	&mov	(&DWP(64+4*8,"esp"),"eax");
   155  	&mov	(&DWP(64+4*9,"esp"),"ebx");
   156  	&mov	(&DWP(64+4*10,"esp"),"ecx");
   157  	&mov	(&DWP(64+4*11,"esp"),"edx");
   158  	&mov	("eax",&DWP(4*0,"edi"));	# copy counter and nonce
   159  	&mov	("ebx",&DWP(4*1,"edi"));
   160  	&mov	("ecx",&DWP(4*2,"edi"));
   161  	&mov	("edx",&DWP(4*3,"edi"));
        	# The counter is stored minus one because "entry" adds one to it
        	# before every block, including the first.
   162  	&sub	("eax",1);
   163  	&mov	(&DWP(64+4*12,"esp"),"eax");
   164  	&mov	(&DWP(64+4*13,"esp"),"ebx");
   165  	&mov	(&DWP(64+4*14,"esp"),"ecx");
   166  	&mov	(&DWP(64+4*15,"esp"),"edx");
   167  	&jmp	(&label("entry"));
   168  
        # Reached from the bottom of the full-block code with $b = input
        # pointer, $a = output pointer and $c = remaining length.
   169  &set_label("outer_loop",16);
   170  	&mov	(&wparam(1),$b);		# save input
   171  	&mov	(&wparam(0),$a);		# save output
   172  	&mov	(&wparam(2),$c);		# save len
   173  &set_label("entry");
        	# Build the working state.  Words 0, 4, 8, 9 and 12 are left in
        	# $a, $b_, $c, $c_ and $d; every other word is written to the
        	# working area, using the remaining registers as temporaries.
   174  	&mov	($a,0x61707865);
   175  	&mov	(&DWP(4*1,"esp"),0x3320646e);
   176  	&mov	(&DWP(4*2,"esp"),0x79622d32);
   177  	&mov	(&DWP(4*3,"esp"),0x6b206574);
   178  
   179  	&mov	($b, &DWP(64+4*5,"esp"));	# copy key material
   180  	&mov	($b_,&DWP(64+4*6,"esp"));
   181  	&mov	($c, &DWP(64+4*10,"esp"));
   182  	&mov	($c_,&DWP(64+4*11,"esp"));
   183  	&mov	($d, &DWP(64+4*13,"esp"));
   184  	&mov	($d_,&DWP(64+4*14,"esp"));
   185  	&mov	(&DWP(4*5,"esp"),$b);
   186  	&mov	(&DWP(4*6,"esp"),$b_);
   187  	&mov	(&DWP(4*10,"esp"),$c);
   188  	&mov	(&DWP(4*11,"esp"),$c_);
   189  	&mov	(&DWP(4*13,"esp"),$d);
   190  	&mov	(&DWP(4*14,"esp"),$d_);
   191  
   192  	&mov	($b, &DWP(64+4*7,"esp"));
   193  	&mov	($d_,&DWP(64+4*15,"esp"));
   194  	&mov	($d, &DWP(64+4*12,"esp"));
   195  	&mov	($b_,&DWP(64+4*4,"esp"));
   196  	&mov	($c, &DWP(64+4*8,"esp"));
   197  	&mov	($c_,&DWP(64+4*9,"esp"));
   198  	&add	($d,1);				# counter value
   199  	&mov	(&DWP(4*7,"esp"),$b);
   200  	&mov	(&DWP(4*15,"esp"),$d_);
   201  	&mov	(&DWP(64+4*12,"esp"),$d);	# save counter value
   202  
   203  	&mov	($b,10);			# loop counter
   204  	&jmp	(&label("loop"));
   205  
        # Ten iterations of eight quarter-rounds each.  The first a+=b of
        # the iteration is emitted here because QUARTERROUND leaves it to its
        # predecessor.  After the eighth call $b names the register that
        # QUARTERROUND loaded with the loop counter from 128(esp).
   206  &set_label("loop",16);
   207  	&add	($a,$b_);			# elsewhere
   208  	&mov	(&DWP(128,"esp"),$b);		# save loop counter
   209  	&mov	($b,$b_);
   210  	&QUARTERROUND(0, 4, 8, 12, 0);
   211  	&QUARTERROUND(1, 5, 9, 13, 1);
   212  	&QUARTERROUND(2, 6,10, 14, 2);
   213  	&QUARTERROUND(3, 7,11, 15, 3);
   214  	&QUARTERROUND(0, 5,10, 15, 4);
   215  	&QUARTERROUND(1, 6,11, 12, 5);
   216  	&QUARTERROUND(2, 7, 8, 13, 6);
   217  	&QUARTERROUND(3, 4, 9, 14, 7);
   218  	&dec	($b);
   219  	&jnz	(&label("loop"));
   220  
        	# Eight QUARTERROUND calls swap every register pair an even number
        	# of times, so the names below map to the same registers as at
        	# loop entry.
   221  	&mov	($b,&wparam(2));		# load len
   222  
   223  	&add	($a,0x61707865);		# accumulate key material
   224  	&add	($b_,&DWP(64+4*4,"esp"));
   225  	&add	($c, &DWP(64+4*8,"esp"));
   226  	&add	($c_,&DWP(64+4*9,"esp"));
   227  
        	# Fewer than 64 bytes left: finish through the byte-wise tail.
   228  	&cmp	($b,64);
   229  	&jb	(&label("tail"));
   230  
        	# Full 64-byte block.  Words 0, 4, 8, 9, 12, 14 are in registers;
        	# word 0 is parked at 0(esp) because $a is needed for the output
        	# pointer, and is written out last.
   231  	&mov	($b,&wparam(1));		# load input pointer
   232  	&add	($d, &DWP(64+4*12,"esp"));
   233  	&add	($d_,&DWP(64+4*14,"esp"));
   234  
   235  	&xor	($a, &DWP(4*0,$b));		# xor with input
   236  	&xor	($b_,&DWP(4*4,$b));
   237  	&mov	(&DWP(4*0,"esp"),$a);
   238  	&mov	($a,&wparam(0));		# load output pointer
   239  	&xor	($c, &DWP(4*8,$b));
   240  	&xor	($c_,&DWP(4*9,$b));
   241  	&xor	($d, &DWP(4*12,$b));
   242  	&xor	($d_,&DWP(4*14,$b));
   243  	&mov	(&DWP(4*4,$a),$b_);		# write output
   244  	&mov	(&DWP(4*8,$a),$c);
   245  	&mov	(&DWP(4*9,$a),$c_);
   246  	&mov	(&DWP(4*12,$a),$d);
   247  	&mov	(&DWP(4*14,$a),$d_);
   248  
        	# Words 1, 2, 3, 5, 6 from the working area.
   249  	&mov	($b_,&DWP(4*1,"esp"));
   250  	&mov	($c, &DWP(4*2,"esp"));
   251  	&mov	($c_,&DWP(4*3,"esp"));
   252  	&mov	($d, &DWP(4*5,"esp"));
   253  	&mov	($d_,&DWP(4*6,"esp"));
   254  	&add	($b_,0x3320646e);		# accumulate key material
   255  	&add	($c, 0x79622d32);
   256  	&add	($c_,0x6b206574);
   257  	&add	($d, &DWP(64+4*5,"esp"));
   258  	&add	($d_,&DWP(64+4*6,"esp"));
   259  	&xor	($b_,&DWP(4*1,$b));
   260  	&xor	($c, &DWP(4*2,$b));
   261  	&xor	($c_,&DWP(4*3,$b));
   262  	&xor	($d, &DWP(4*5,$b));
   263  	&xor	($d_,&DWP(4*6,$b));
   264  	&mov	(&DWP(4*1,$a),$b_);
   265  	&mov	(&DWP(4*2,$a),$c);
   266  	&mov	(&DWP(4*3,$a),$c_);
   267  	&mov	(&DWP(4*5,$a),$d);
   268  	&mov	(&DWP(4*6,$a),$d_);
   269  
        	# Words 7, 10, 11, 13, 15, then the parked word 0; both pointers
        	# advance by 64 bytes.
   270  	&mov	($b_,&DWP(4*7,"esp"));
   271  	&mov	($c, &DWP(4*10,"esp"));
   272  	&mov	($c_,&DWP(4*11,"esp"));
   273  	&mov	($d, &DWP(4*13,"esp"));
   274  	&mov	($d_,&DWP(4*15,"esp"));
   275  	&add	($b_,&DWP(64+4*7,"esp"));
   276  	&add	($c, &DWP(64+4*10,"esp"));
   277  	&add	($c_,&DWP(64+4*11,"esp"));
   278  	&add	($d, &DWP(64+4*13,"esp"));
   279  	&add	($d_,&DWP(64+4*15,"esp"));
   280  	&xor	($b_,&DWP(4*7,$b));
   281  	&xor	($c, &DWP(4*10,$b));
   282  	&xor	($c_,&DWP(4*11,$b));
   283  	&xor	($d, &DWP(4*13,$b));
   284  	&xor	($d_,&DWP(4*15,$b));
   285  	&lea	($b,&DWP(4*16,$b));
   286  	&mov	(&DWP(4*7,$a),$b_);
   287  	&mov	($b_,&DWP(4*0,"esp"));
   288  	&mov	(&DWP(4*10,$a),$c);
   289  	&mov	($c,&wparam(2));		# len
   290  	&mov	(&DWP(4*11,$a),$c_);
   291  	&mov	(&DWP(4*13,$a),$d);
   292  	&mov	(&DWP(4*15,$a),$d_);
   293  	&mov	(&DWP(4*0,$a),$b_);
   294  	&lea	($a,&DWP(4*16,$a));
        	# Loop while bytes remain; a remainder below 64 is diverted to
        	# "tail" on the next pass.
   295  	&sub	($c,64);
   296  	&jnz	(&label("outer_loop"));
   297  
   298  	&jmp	(&label("done"));
   299  
        # Partial final block: store the complete keystream block (with key
        # material added) to 0..63(esp), then xor it into the data one byte
        # at a time.  $b still holds the remaining length here.
   300  &set_label("tail");
   301  	&add	($d, &DWP(64+4*12,"esp"));
   302  	&add	($d_,&DWP(64+4*14,"esp"));
   303  	&mov	(&DWP(4*0,"esp"),$a);
   304  	&mov	(&DWP(4*4,"esp"),$b_);
   305  	&mov	(&DWP(4*8,"esp"),$c);
   306  	&mov	(&DWP(4*9,"esp"),$c_);
   307  	&mov	(&DWP(4*12,"esp"),$d);
   308  	&mov	(&DWP(4*14,"esp"),$d_);
   309  
   310  	&mov	($b_,&DWP(4*1,"esp"));
   311  	&mov	($c, &DWP(4*2,"esp"));
   312  	&mov	($c_,&DWP(4*3,"esp"));
   313  	&mov	($d, &DWP(4*5,"esp"));
   314  	&mov	($d_,&DWP(4*6,"esp"));
   315  	&add	($b_,0x3320646e);		# accumulate key material
   316  	&add	($c, 0x79622d32);
   317  	&add	($c_,0x6b206574);
   318  	&add	($d, &DWP(64+4*5,"esp"));
   319  	&add	($d_,&DWP(64+4*6,"esp"));
   320  	&mov	(&DWP(4*1,"esp"),$b_);
   321  	&mov	(&DWP(4*2,"esp"),$c);
   322  	&mov	(&DWP(4*3,"esp"),$c_);
   323  	&mov	(&DWP(4*5,"esp"),$d);
   324  	&mov	(&DWP(4*6,"esp"),$d_);
   325  
   326  	&mov	($b_,&DWP(4*7,"esp"));
   327  	&mov	($c, &DWP(4*10,"esp"));
   328  	&mov	($c_,&DWP(4*11,"esp"));
   329  	&mov	($d, &DWP(4*13,"esp"));
   330  	&mov	($d_,&DWP(4*15,"esp"));
   331  	&add	($b_,&DWP(64+4*7,"esp"));
   332  	&add	($c, &DWP(64+4*10,"esp"));
   333  	&add	($c_,&DWP(64+4*11,"esp"));
   334  	&add	($d, &DWP(64+4*13,"esp"));
   335  	&add	($d_,&DWP(64+4*15,"esp"));
   336  	&mov	(&DWP(4*7,"esp"),$b_);
   337  	&mov	($b_,&wparam(1));		# load input
   338  	&mov	(&DWP(4*10,"esp"),$c);
   339  	&mov	($c,&wparam(0));		# load output
   340  	&mov	(&DWP(4*11,"esp"),$c_);
   341  	&xor	($c_,$c_);
   342  	&mov	(&DWP(4*13,"esp"),$d);
   343  	&mov	(&DWP(4*15,"esp"),$d_);
   344  
        	# Byte loop: $b_ = input, $c = output, $c_ = byte index (starts
        	# at zero), $b = bytes remaining.
   345  	&xor	("eax","eax");
   346  	&xor	("edx","edx");
   347  &set_label("tail_loop");
   348  	&movb	("al",&BP(0,$c_,$b_));
   349  	&movb	("dl",&BP(0,"esp",$c_));
   350  	&lea	($c_,&DWP(1,$c_));
   351  	&xor	("al","dl");
   352  	&mov	(&BP(-1,$c,$c_),"al");
   353  	&dec	($b);
   354  	&jnz	(&label("tail_loop"));
   355  
   356  &set_label("done");
   357  	&stack_pop(33);
   358  &set_label("no_data");
   359  &function_end("ChaCha20_ctr32");
   360  
   361  if ($xmm) {
   362  my ($xa,$xa_,$xb,$xb_,$xc,$xc_,$xd,$xd_)=map("xmm$_",(0..7));
   363  my ($out,$inp,$len)=("edi","esi","ecx");
   364  
   365  sub QUARTERROUND_SSSE3 {
        # Emit SSE code for one ChaCha quarter-round on 128-bit vectors.
        #   $ai,$bi,$ci,$di  indices (0..15) of the state vectors acting as
        #                    a, b, c, d; vector n lives at 16*n-128(ebx)
        #   $i               position (0..7) of this call within the
        #                    eight-call sequence of the main loop
        # The index bookkeeping and the interleaving of loads/stores with the
        # arithmetic mirror the scalar QUARTERROUND; the statement order is
        # significant.  The enclosing lexicals $xa/$xa_ .. $xd/$xd_ are
        # swapped on exit, so consecutive calls alternate registers.
        #
        # "next"/"previous" indices: keep the group of four (index & ~3) and
        # step the low two bits by +1 / -1 modulo 4.
   366  my ($ai,$bi,$ci,$di,$i)=@_;
   367  my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di));	# next
   368  my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di));	# previous
   369  
   370  	#       a   b   c   d
   371  	#
   372  	#       0   4   8  12 < even round
   373  	#       1   5   9  13
   374  	#       2   6  10  14
   375  	#       3   7  11  15
   376  	#       0   5  10  15 < odd round
   377  	#       1   6  11  12
   378  	#       2   7   8  13
   379  	#       3   4   9  14
   380  
        	# Corrections at the first and last call of each round, where the
        	# neighbouring call follows the other index pattern:
        	#   $i==0: previous becomes (3,4,9,14)
        	#   $i==3: next     becomes (0,5,10,15)
        	#   $i==4: previous becomes (3,7,11,15)
        	#   $i==7: next     becomes (0,4,8,12)
   381  	if ($i==0) {
   382              my $j=4;
   383  	    ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp));
   384  	} elsif ($i==3) {
   385              my $j=0;
   386  	    ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn));
   387  	} elsif ($i==4) {
   388              my $j=4;
   389  	    ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp));
   390  	} elsif ($i==7) {
   391              my $j=0;
   392  	    ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn));
   393  	}
   394  
        	# The leading a+=b and d^=a are emitted by the previous call (or
        	# at the loop top).  The 16- and 8-bit rotations of d are byte
        	# shuffles using the masks at 0(eax) and 16(eax); the 12- and
        	# 7-bit rotations of b are a shift pair combined with por, using
        	# $xa_ and then $xa as the scratch register.  Statements indented
        	# by one extra space are the interleaved loads/stores and the
        	# next call's leading operations.
   395  	#&paddd	($xa,$xb);			# see elsewhere
   396  	#&pxor	($xd,$xa);			# see elsewhere
   397  	 &movdqa(&QWP(16*$cp-128,"ebx"),$xc_)	if ($ai>0 && $ai<3);
   398  	&pshufb	($xd,&QWP(0,"eax"));		# rot16
   399  	 &movdqa(&QWP(16*$bp-128,"ebx"),$xb_)	if ($i!=0);
   400  	&paddd	($xc,$xd);
   401  	 &movdqa($xc_,&QWP(16*$cn-128,"ebx"))	if ($ai>0 && $ai<3);
   402  	&pxor	($xb,$xc);
   403  	 &movdqa($xb_,&QWP(16*$bn-128,"ebx"))	if ($i<7);
   404  	&movdqa	($xa_,$xb);			# borrow as temporary
   405  	&pslld	($xb,12);
   406  	&psrld	($xa_,20);
   407  	&por	($xb,$xa_);
   408  	 &movdqa($xa_,&QWP(16*$an-128,"ebx"));
   409  	&paddd	($xa,$xb);
   410  	 &movdqa($xd_,&QWP(16*$dn-128,"ebx"))	if ($di!=$dn);
   411  	&pxor	($xd,$xa);
   412  	&movdqa	(&QWP(16*$ai-128,"ebx"),$xa);
   413  	&pshufb	($xd,&QWP(16,"eax"));		# rot8
   414  	&paddd	($xc,$xd);
   415  	&movdqa	(&QWP(16*$di-128,"ebx"),$xd)	if ($di!=$dn);
   416  	&movdqa	($xd_,$xd)			if ($di==$dn);
   417  	&pxor	($xb,$xc);
   418  	 &paddd	($xa_,$xb_)			if ($i<7);	# elsewhere
   419  	&movdqa	($xa,$xb);			# borrow as temporary
   420  	&pslld	($xb,7);
   421  	&psrld	($xa,25);
   422  	 &pxor	($xd_,$xa_)			if ($i<7);	# elsewhere
   423  	&por	($xb,$xa);
   424  
        	# Exchange the names of each register pair for the next call.
   425  	($xa,$xa_)=($xa_,$xa);
   426  	($xb,$xb_)=($xb_,$xb);
   427  	($xc,$xc_)=($xc_,$xc);
   428  	($xd,$xd_)=($xd_,$xd);
   429  }
   430  
   431  &function_begin("_ChaCha20_ssse3");
        # ChaCha20_ctr32 jumps to ssse3_shortcut; the lea below expects eax to
        # still hold the address of pic_point from that function.  Arguments
        # are the same five wparam slots: output, input, length, key,
        # counter and nonce.
   432  &set_label("ssse3_shortcut");
   433  	&mov		($out,&wparam(0));
   434  	&mov		($inp,&wparam(1));
   435  	&mov		($len,&wparam(2));
   436  	&mov		("edx",&wparam(3));		# key
   437  	&mov		("ebx",&wparam(4));		# counter and nonce
   438  
        	# Allocate the frame, align esp down to 64 bytes and keep the
        	# original esp at 512(esp); it is restored from there at "done".
   439  	&mov		("ebp","esp");
   440  	&stack_push	(131);
   441  	&and		("esp",-64);
   442  	&mov		(&DWP(512,"esp"),"ebp");
   443  
        	# eax -> ssse3_data constant table.
   444  	&lea		("eax",&DWP(&label("ssse3_data")."-".
   445  				    &label("pic_point"),"eax"));
   446  	&movdqu		("xmm3",&QWP(0,"ebx"));		# counter and nonce
   447  
   448  if (defined($gasver) && $gasver>=2.17) {		# even though we encode
   449  							# pshufb manually, we
   450  							# handle only register
   451  							# operands, while this
   452  							# segment uses memory
   453  							# operand...
        	# 4x path: four blocks (256 bytes) per outer iteration; shorter
        	# requests go straight to the one-block code at "1x".
   454  	&cmp		($len,64*4);
   455  	&jb		(&label("1x"));
   456  
        	# Layout used from here on:
        	#   16*n-128(ebp), ebp = esp+384: input state, word n broadcast
        	#                                 to the four lanes of a vector
        	#   16*n-128(ebx), ebx = esp+128: working copy for the rounds
        	#   512+4(esp), 512+8(esp):       saved key / counter pointers
   457  	&mov		(&DWP(512+4,"esp"),"edx");	# offload pointers
   458  	&mov		(&DWP(512+8,"esp"),"ebx");
   459  	&sub		($len,64*4);			# bias len
   460  	&lea		("ebp",&DWP(256+128,"esp"));	# size optimization
   461  
        	# Broadcast counter/nonce words (12..15) and key words 4..7.
        	# The counter vector becomes counter+{0,1,2,3}-{4,4,4,4}; the
        	# outer loop adds {4,4,4,4} back before every batch.
   462  	&movdqu		("xmm7",&QWP(0,"edx"));		# key
   463  	&pshufd		("xmm0","xmm3",0x00);
   464  	&pshufd		("xmm1","xmm3",0x55);
   465  	&pshufd		("xmm2","xmm3",0xaa);
   466  	&pshufd		("xmm3","xmm3",0xff);
   467  	 &paddd		("xmm0",&QWP(16*3,"eax"));	# fix counters
   468  	&pshufd		("xmm4","xmm7",0x00);
   469  	&pshufd		("xmm5","xmm7",0x55);
   470  	 &psubd		("xmm0",&QWP(16*4,"eax"));
   471  	&pshufd		("xmm6","xmm7",0xaa);
   472  	&pshufd		("xmm7","xmm7",0xff);
   473  	&movdqa		(&QWP(16*12-128,"ebp"),"xmm0");
   474  	&movdqa		(&QWP(16*13-128,"ebp"),"xmm1");
   475  	&movdqa		(&QWP(16*14-128,"ebp"),"xmm2");
   476  	&movdqa		(&QWP(16*15-128,"ebp"),"xmm3");
   477  	 &movdqu	("xmm3",&QWP(16,"edx"));	# key
   478  	&movdqa		(&QWP(16*4-128,"ebp"),"xmm4");
   479  	&movdqa		(&QWP(16*5-128,"ebp"),"xmm5");
   480  	&movdqa		(&QWP(16*6-128,"ebp"),"xmm6");
   481  	&movdqa		(&QWP(16*7-128,"ebp"),"xmm7");
   482  	 &movdqa	("xmm7",&QWP(16*2,"eax"));	# sigma
   483  	 &lea		("ebx",&DWP(128,"esp"));	# size optimization
   484  
        	# Broadcast key words 8..11 and the sigma words 0..3.
   485  	&pshufd		("xmm0","xmm3",0x00);
   486  	&pshufd		("xmm1","xmm3",0x55);
   487  	&pshufd		("xmm2","xmm3",0xaa);
   488  	&pshufd		("xmm3","xmm3",0xff);
   489  	&pshufd		("xmm4","xmm7",0x00);
   490  	&pshufd		("xmm5","xmm7",0x55);
   491  	&pshufd		("xmm6","xmm7",0xaa);
   492  	&pshufd		("xmm7","xmm7",0xff);
   493  	&movdqa		(&QWP(16*8-128,"ebp"),"xmm0");
   494  	&movdqa		(&QWP(16*9-128,"ebp"),"xmm1");
   495  	&movdqa		(&QWP(16*10-128,"ebp"),"xmm2");
   496  	&movdqa		(&QWP(16*11-128,"ebp"),"xmm3");
   497  	&movdqa		(&QWP(16*0-128,"ebp"),"xmm4");
   498  	&movdqa		(&QWP(16*1-128,"ebp"),"xmm5");
   499  	&movdqa		(&QWP(16*2-128,"ebp"),"xmm6");
   500  	&movdqa		(&QWP(16*3-128,"ebp"),"xmm7");
   501  
   502  	&lea		($inp,&DWP(128,$inp));		# size optimization
   503  	&lea		($out,&DWP(128,$out));		# size optimization
   504  	&jmp		(&label("outer_loop"));
   505  
        # Per-batch setup: copy the input state to the working area.  Vectors
        # 0, 4, 8 and 9 are not copied (see the commented-out lines); they
        # are loaded straight into registers below.  The counter vector is
        # advanced by {4,4,4,4} and written back.
   506  &set_label("outer_loop",16);
   507  	#&movdqa	("xmm0",&QWP(16*0-128,"ebp"));	# copy key material
   508  	&movdqa		("xmm1",&QWP(16*1-128,"ebp"));
   509  	&movdqa		("xmm2",&QWP(16*2-128,"ebp"));
   510  	&movdqa		("xmm3",&QWP(16*3-128,"ebp"));
   511  	#&movdqa	("xmm4",&QWP(16*4-128,"ebp"));
   512  	&movdqa		("xmm5",&QWP(16*5-128,"ebp"));
   513  	&movdqa		("xmm6",&QWP(16*6-128,"ebp"));
   514  	&movdqa		("xmm7",&QWP(16*7-128,"ebp"));
   515  	#&movdqa	(&QWP(16*0-128,"ebx"),"xmm0");
   516  	&movdqa		(&QWP(16*1-128,"ebx"),"xmm1");
   517  	&movdqa		(&QWP(16*2-128,"ebx"),"xmm2");
   518  	&movdqa		(&QWP(16*3-128,"ebx"),"xmm3");
   519  	#&movdqa	(&QWP(16*4-128,"ebx"),"xmm4");
   520  	&movdqa		(&QWP(16*5-128,"ebx"),"xmm5");
   521  	&movdqa		(&QWP(16*6-128,"ebx"),"xmm6");
   522  	&movdqa		(&QWP(16*7-128,"ebx"),"xmm7");
   523  	#&movdqa	("xmm0",&QWP(16*8-128,"ebp"));
   524  	#&movdqa	("xmm1",&QWP(16*9-128,"ebp"));
   525  	&movdqa		("xmm2",&QWP(16*10-128,"ebp"));
   526  	&movdqa		("xmm3",&QWP(16*11-128,"ebp"));
   527  	&movdqa		("xmm4",&QWP(16*12-128,"ebp"));
   528  	&movdqa		("xmm5",&QWP(16*13-128,"ebp"));
   529  	&movdqa		("xmm6",&QWP(16*14-128,"ebp"));
   530  	&movdqa		("xmm7",&QWP(16*15-128,"ebp"));
   531  	&paddd		("xmm4",&QWP(16*4,"eax"));	# counter value
   532  	#&movdqa	(&QWP(16*8-128,"ebx"),"xmm0");
   533  	#&movdqa	(&QWP(16*9-128,"ebx"),"xmm1");
   534  	&movdqa		(&QWP(16*10-128,"ebx"),"xmm2");
   535  	&movdqa		(&QWP(16*11-128,"ebx"),"xmm3");
   536  	&movdqa		(&QWP(16*12-128,"ebx"),"xmm4");
   537  	&movdqa		(&QWP(16*13-128,"ebx"),"xmm5");
   538  	&movdqa		(&QWP(16*14-128,"ebx"),"xmm6");
   539  	&movdqa		(&QWP(16*15-128,"ebx"),"xmm7");
   540  	&movdqa		(&QWP(16*12-128,"ebp"),"xmm4");	# save counter value
   541  
        	# Vectors held in registers on loop entry: 0, 12 (the updated
        	# counter), 4, 8 and 9.
   542  	&movdqa		($xa, &QWP(16*0-128,"ebp"));
   543  	&movdqa		($xd, "xmm4");
   544  	&movdqa		($xb_,&QWP(16*4-128,"ebp"));
   545  	&movdqa		($xc, &QWP(16*8-128,"ebp"));
   546  	&movdqa		($xc_,&QWP(16*9-128,"ebp"));
   547  
   548  	&mov		("edx",10);			# loop counter
   549  	&nop		();
   550  
        # Ten iterations of eight quarter-rounds.  The leading a+=b and d^=a
        # of the first call are emitted here because QUARTERROUND_SSSE3
        # leaves them to its predecessor.
   551  &set_label("loop",16);
   552  	&paddd		($xa,$xb_);			# elsewhere
   553  	&movdqa		($xb,$xb_);
   554  	&pxor		($xd,$xa);			# elsewhere
   555  	&QUARTERROUND_SSSE3(0, 4, 8, 12, 0);
   556  	&QUARTERROUND_SSSE3(1, 5, 9, 13, 1);
   557  	&QUARTERROUND_SSSE3(2, 6,10, 14, 2);
   558  	&QUARTERROUND_SSSE3(3, 7,11, 15, 3);
   559  	&QUARTERROUND_SSSE3(0, 5,10, 15, 4);
   560  	&QUARTERROUND_SSSE3(1, 6,11, 12, 5);
   561  	&QUARTERROUND_SSSE3(2, 7, 8, 13, 6);
   562  	&QUARTERROUND_SSSE3(3, 4, 9, 14, 7);
   563  	&dec		("edx");
   564  	&jnz		(&label("loop"));
   565  
        	# Write back the vectors still held only in registers.
   566  	&movdqa		(&QWP(16*4-128,"ebx"),$xb_);
   567  	&movdqa		(&QWP(16*8-128,"ebx"),$xc);
   568  	&movdqa		(&QWP(16*9-128,"ebx"),$xc_);
   569  	&movdqa		(&QWP(16*12-128,"ebx"),$xd);
   570  	&movdqa		(&QWP(16*14-128,"ebx"),$xd_);
   571  
   572      my ($xa0,$xa1,$xa2,$xa3,$xt0,$xt1,$xt2,$xt3)=map("xmm$_",(0..7));
   573  
   574  	#&movdqa	($xa0,&QWP(16*0-128,"ebx"));	# it's there
   575  	&movdqa		($xa1,&QWP(16*1-128,"ebx"));
   576  	&movdqa		($xa2,&QWP(16*2-128,"ebx"));
   577  	&movdqa		($xa3,&QWP(16*3-128,"ebx"));
   578  
        	# Unrolled four times at generation time, once per group of four
        	# state vectors ($i = 0, 64, 128, 192 is the byte offset of the
        	# group).  Each pass adds the input state, transposes the 4x4
        	# dword matrix so that each register holds 16 consecutive bytes
        	# of one block, xors with the input at 64-byte strides and stores
        	# the result.  $inp/$out advance by 16 after the first three
        	# groups and by 64*4-16*3 after the last, i.e. 256 per batch.
   579      for($i=0;$i<256;$i+=64) {
   580  	&paddd		($xa0,&QWP($i+16*0-128,"ebp"));	# accumulate key material
   581  	&paddd		($xa1,&QWP($i+16*1-128,"ebp"));
   582  	&paddd		($xa2,&QWP($i+16*2-128,"ebp"));
   583  	&paddd		($xa3,&QWP($i+16*3-128,"ebp"));
   584  
   585  	&movdqa		($xt2,$xa0);		# "de-interlace" data
   586  	&punpckldq	($xa0,$xa1);
   587  	&movdqa		($xt3,$xa2);
   588  	&punpckldq	($xa2,$xa3);
   589  	&punpckhdq	($xt2,$xa1);
   590  	&punpckhdq	($xt3,$xa3);
   591  	&movdqa		($xa1,$xa0);
   592  	&punpcklqdq	($xa0,$xa2);		# "a0"
   593  	&movdqa		($xa3,$xt2);
   594  	&punpcklqdq	($xt2,$xt3);		# "a2"
   595  	&punpckhqdq	($xa1,$xa2);		# "a1"
   596  	&punpckhqdq	($xa3,$xt3);		# "a3"
   597  
   598  	#($xa2,$xt2)=($xt2,$xa2);
   599  
        	# Note the third input lands in $xa2 and is xored into $xt2,
        	# which holds "a2" after the transpose above.
   600  	&movdqu		($xt0,&QWP(64*0-128,$inp));	# load input
   601  	&movdqu		($xt1,&QWP(64*1-128,$inp));
   602  	&movdqu		($xa2,&QWP(64*2-128,$inp));
   603  	&movdqu		($xt3,&QWP(64*3-128,$inp));
   604  	&lea		($inp,&QWP($i<192?16:(64*4-16*3),$inp));
   605  	&pxor		($xt0,$xa0);
   606  	&movdqa		($xa0,&QWP($i+16*4-128,"ebx"))	if ($i<192);
   607  	&pxor		($xt1,$xa1);
   608  	&movdqa		($xa1,&QWP($i+16*5-128,"ebx"))	if ($i<192);
   609  	&pxor		($xt2,$xa2);
   610  	&movdqa		($xa2,&QWP($i+16*6-128,"ebx"))	if ($i<192);
   611  	&pxor		($xt3,$xa3);
   612  	&movdqa		($xa3,&QWP($i+16*7-128,"ebx"))	if ($i<192);
   613  	&movdqu		(&QWP(64*0-128,$out),$xt0);	# store output
   614  	&movdqu		(&QWP(64*1-128,$out),$xt1);
   615  	&movdqu		(&QWP(64*2-128,$out),$xt2);
   616  	&movdqu		(&QWP(64*3-128,$out),$xt3);
   617  	&lea		($out,&QWP($i<192?16:(64*4-16*3),$out));
   618      }
        	# $len was biased by -256 before the loop, so "no borrow" means
        	# another full batch is available.
   619  	&sub		($len,64*4);
   620  	&jnc		(&label("outer_loop"));
   621  
        	# Undo the bias; zero means everything has been processed.
   622  	&add		($len,64*4);
   623  	&jz		(&label("done"));
   624  
        	# Fewer than 256 bytes remain: undo the pointer bias and fall
        	# through to the one-block code with edx = key, ebx = counter and
        	# nonce.
   625  	&mov		("ebx",&DWP(512+8,"esp"));	# restore pointers
   626  	&lea		($inp,&DWP(-128,$inp));
   627  	&mov		("edx",&DWP(512+4,"esp"));
   628  	&lea		($out,&DWP(-128,$out));
   629  
        	# Rebuild the counter/nonce row in xmm3: take the low dword of
        	# the saved counter vector plus four, clear dword 0 of the
        	# caller's counter/nonce with the {0,-1,-1,-1} mask and merge.
   630  	&movd		("xmm2",&DWP(16*12-128,"ebp"));	# counter value
   631  	&movdqu		("xmm3",&QWP(0,"ebx"));
   632  	&paddd		("xmm2",&QWP(16*6,"eax"));	# +four
   633  	&pand		("xmm3",&QWP(16*7,"eax"));
   634  	&por		("xmm3","xmm2");		# counter value
   635  }
   636  {
        # One-block SSSE3 code.  The four state rows live in $a..$d (xmm0..3),
        # $t/$t1 are scratch, and $rot16/$rot24 hold the byte-shuffle masks
        # loaded from 0(eax) and 16(eax).
   637  my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("xmm$_",(0..7));
   638  
        # Emit one round: the quarter-round applied to all four lanes of the
        # rows $a..$d at once.  Takes no arguments; rotations by 16 and by
        # the $rot24 mask are byte shuffles, rotations by 12 and 7 are a
        # shift pair combined with por through $t.
   639  sub SSSE3ROUND {	# critical path is 20 "SIMD ticks" per round
   640  	&paddd		($a,$b);
   641  	&pxor		($d,$a);
   642  	&pshufb		($d,$rot16);
   643  
   644  	&paddd		($c,$d);
   645  	&pxor		($b,$c);
   646  	&movdqa		($t,$b);
   647  	&psrld		($b,20);
   648  	&pslld		($t,12);
   649  	&por		($b,$t);
   650  
   651  	&paddd		($a,$b);
   652  	&pxor		($d,$a);
   653  	&pshufb		($d,$rot24);
   654  
   655  	&paddd		($c,$d);
   656  	&pxor		($b,$c);
   657  	&movdqa		($t,$b);
   658  	&psrld		($b,25);
   659  	&pslld		($t,7);
   660  	&por		($b,$t);
   661  }
   662  
        # Entry for requests shorter than 256 bytes and for the remainder of
        # the 4x path.  Expects edx = key, xmm3 ($d) = counter and nonce,
        # eax = constant table.  The input state is kept at 0..63(esp).
   663  &set_label("1x");
   664  	&movdqa		($a,&QWP(16*2,"eax"));		# sigma
   665  	&movdqu		($b,&QWP(0,"edx"));
   666  	&movdqu		($c,&QWP(16,"edx"));
   667  	#&movdqu	($d,&QWP(0,"ebx"));		# already loaded
   668  	&movdqa		($rot16,&QWP(0,"eax"));
   669  	&movdqa		($rot24,&QWP(16,"eax"));
        	# NOTE(review): this dword store to 16*3(esp) is overwritten by
        	# the movdqa of $d to the same address a few lines below.
   670  	&mov		(&DWP(16*3,"esp"),"ebp");
   671  
   672  	&movdqa		(&QWP(16*0,"esp"),$a);
   673  	&movdqa		(&QWP(16*1,"esp"),$b);
   674  	&movdqa		(&QWP(16*2,"esp"),$c);
   675  	&movdqa		(&QWP(16*3,"esp"),$d);
   676  	&mov		("edx",10);
   677  	&jmp		(&label("loop1x"));
   678  
        # Subsequent blocks: reload the saved input state and add {1,0,0,0}
        # to the counter row, storing the new row back.
   679  &set_label("outer1x",16);
   680  	&movdqa		($d,&QWP(16*5,"eax"));		# one
   681  	&movdqa		($a,&QWP(16*0,"esp"));
   682  	&movdqa		($b,&QWP(16*1,"esp"));
   683  	&movdqa		($c,&QWP(16*2,"esp"));
   684  	&paddd		($d,&QWP(16*3,"esp"));
   685  	&mov		("edx",10);
   686  	&movdqa		(&QWP(16*3,"esp"),$d);
   687  	&jmp		(&label("loop1x"));
   688  
        # Ten iterations of two rounds.  The pshufd triplets rotate the
        # dwords of rows b, c, d between the rounds: 0b00111001 moves dword
        # n+1 to n, 0b01001110 moves dword n+2 to n, 0b10010011 moves dword
        # n+3 to n; the second triplet applies the inverse rotations.
   689  &set_label("loop1x",16);
   690  	&SSSE3ROUND();
   691  	&pshufd	($c,$c,0b01001110);
   692  	&pshufd	($b,$b,0b00111001);
   693  	&pshufd	($d,$d,0b10010011);
   694  	&nop	();
   695  
   696  	&SSSE3ROUND();
   697  	&pshufd	($c,$c,0b01001110);
   698  	&pshufd	($b,$b,0b10010011);
   699  	&pshufd	($d,$d,0b00111001);
   700  
   701  	&dec		("edx");
   702  	&jnz		(&label("loop1x"));
   703  
        	# Add the input state to obtain the keystream block.
   704  	&paddd		($a,&QWP(16*0,"esp"));
   705  	&paddd		($b,&QWP(16*1,"esp"));
   706  	&paddd		($c,&QWP(16*2,"esp"));
   707  	&paddd		($d,&QWP(16*3,"esp"));
   708  
   709  	&cmp		($len,64);
   710  	&jb		(&label("tail"));
   711  
   712  	&movdqu		($t,&QWP(16*0,$inp));
   713  	&movdqu		($t1,&QWP(16*1,$inp));
   714  	&pxor		($a,$t);		# xor with input
   715  	&movdqu		($t,&QWP(16*2,$inp));
   716  	&pxor		($b,$t1);
   717  	&movdqu		($t1,&QWP(16*3,$inp));
   718  	&pxor		($c,$t);
   719  	&pxor		($d,$t1);
   720  	&lea		($inp,&DWP(16*4,$inp));	# inp+=64
   721  
   722  	&movdqu		(&QWP(16*0,$out),$a);	# write output
   723  	&movdqu		(&QWP(16*1,$out),$b);
   724  	&movdqu		(&QWP(16*2,$out),$c);
   725  	&movdqu		(&QWP(16*3,$out),$d);
   726  	&lea		($out,&DWP(16*4,$out));	# out+=64
   727  
   728  	&sub		($len,64);
   729  	&jnz		(&label("outer1x"));
   730  
   731  	&jmp		(&label("done"));
   732  
        # Partial final block: spill the keystream block over the saved state
        # at 0..63(esp) and xor $len bytes one at a time, with ebp as the
        # byte index.  eax and edx are reused as byte scratch registers.
   733  &set_label("tail");
   734  	&movdqa		(&QWP(16*0,"esp"),$a);
   735  	&movdqa		(&QWP(16*1,"esp"),$b);
   736  	&movdqa		(&QWP(16*2,"esp"),$c);
   737  	&movdqa		(&QWP(16*3,"esp"),$d);
   738  
   739  	&xor		("eax","eax");
   740  	&xor		("edx","edx");
   741  	&xor		("ebp","ebp");
   742  
   743  &set_label("tail_loop");
   744  	&movb		("al",&BP(0,"esp","ebp"));
   745  	&movb		("dl",&BP(0,$inp,"ebp"));
   746  	&lea		("ebp",&DWP(1,"ebp"));
   747  	&xor		("al","dl");
   748  	&movb		(&BP(-1,$out,"ebp"),"al");
   749  	&dec		($len);
   750  	&jnz		(&label("tail_loop"));
   751  }
   752  &set_label("done");
        	# Restore the stack pointer saved at 512(esp) by the prologue.
   753  	&mov		("esp",&DWP(512,"esp"));
   754  &function_end("_ChaCha20_ssse3");
   755  
        # Constant table addressed through eax in the SSSE3 code; each row is
        # 16 bytes.  Rows, by the offsets the code uses:
        #   16*0  pshufb mask used as "rot16"
        #   16*1  pshufb mask used as "rot8" (loaded as $rot24 in the 1x code)
        #   16*2  sigma
        #   16*3  {0,1,2,3}    added to the broadcast counter ("fix counters")
        #   16*4  {4,4,4,4}    per-batch counter step of the 4x path
        #   16*5  {1,0,0,0}    per-block counter step of the 1x path ("one")
        #   16*6  {4,0,0,0}    counter step when leaving the 4x path ("+four")
        #   16*7  {0,-1,-1,-1} mask that clears the counter dword
   756  &align	(64);
   757  &set_label("ssse3_data");
   758  &data_byte(0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd);
   759  &data_byte(0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe);
   760  &data_word(0x61707865,0x3320646e,0x79622d32,0x6b206574);
   761  &data_word(0,1,2,3);
   762  &data_word(4,4,4,4);
   763  &data_word(1,0,0,0);
   764  &data_word(4,0,0,0);
   765  &data_word(0,-1,-1,-1);
   766  &align	(64);
   767  }
   768  &asciz	('ChaCha20 for x86, CRYPTOGAMS by <appro@openssl.org>');
   769  
        # Finish the perlasm session (asm_finish is presumably provided by
        # x86asm.pl, like the other helpers used in this file).
   770  &asm_finish();
   771  
        # Write errors on a buffered handle may only surface at close time,
        # so a failed close is fatal.
   772  close(STDOUT) or die "error closing STDOUT: $!";