#!/usr/bin/env perl

# Copyright (c) 2015, CloudFlare Ltd.
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

##############################################################################
#                                                                            #
# Author: Vlad Krasnov                                                       #
#                                                                            #
##############################################################################

$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

$avx = 2;

$code.=<<___;
.text
.extern OPENSSL_ia32cap_P

chacha20_poly1305_constants:

.section .rodata
.align 64
.Lchacha20_consts:
.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
.Lrol8:
.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
.Lrol16:
.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
.Lavx2_init:
.long 0,0,0,0
.Lsse_inc:
.long 1,0,0,0
.Lavx2_inc:
.long 2,0,0,0,2,0,0,0
.Lclamp:
.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
.quad 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF
.align 16
.Land_masks:
.byte 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
.text
___

my ($oup,$inp,$inl,$adp,$keyp,$itr1,$itr2,$adl)=("%rdi","%rsi","%rbx","%rcx","%r9","%rcx","%r8","%r8");
my ($acc0,$acc1,$acc2)=map("%r$_",(10..12));
my ($t0,$t1,$t2,$t3)=("%r13","%r14","%r15","%r9");
my ($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)=map("%xmm$_",(0..15));
my ($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3);
my $xmm_storage = 0;
if ($win64) {
    $xmm_storage = 10*16;
}
my $xmm_store="0*16(%rbp)";
my $r_store="$xmm_storage+0*16(%rbp)";
my $s_store="$xmm_storage+1*16(%rbp)";
my $len_store="$xmm_storage+2*16(%rbp)";
my $state1_store="$xmm_storage+3*16(%rbp)";
my $state2_store="$xmm_storage+4*16(%rbp)";
my $tmp_store="$xmm_storage+5*16(%rbp)";
my $ctr0_store="$xmm_storage+6*16(%rbp)";
my $ctr1_store="$xmm_storage+7*16(%rbp)";
my $ctr2_store="$xmm_storage+8*16(%rbp)";
my $ctr3_store="$xmm_storage+9*16(%rbp)";

sub chacha_qr {
my ($a,$b,$c,$d,$t,$dir)=@_;
$code.="movdqa $t, $tmp_store\n" if ($dir =~ /store/);
$code.="paddd $b, $a
    pxor $a, $d
    pshufb .Lrol16(%rip), $d
    paddd $d, $c
    pxor $c, $b
    movdqa $b, $t
    pslld \$12, $t
    psrld \$20, $b
    pxor $t, $b
    paddd $b, $a
    pxor $a, $d
    pshufb .Lrol8(%rip), $d
    paddd $d, $c
    pxor $c, $b
    movdqa $b, $t
    pslld \$7, $t
    psrld \$25, $b
    pxor $t, $b\n";
$code.="palignr \$4, $b, $b
    palignr \$8, $c, $c
    palignr \$12, $d, $d\n" if ($dir =~ /left/);
$code.="palignr \$12, $b, $b
    palignr \$8, $c, $c
    palignr \$4, $d, $d\n" if ($dir =~ /right/);
$code.="movdqa $tmp_store, $t\n" if ($dir =~ /load/);
}

sub poly_add {
my ($src)=@_;
$code.="add 0+$src, $acc0
    adc 8+$src, $acc1
    adc \$1, $acc2\n";
}

sub poly_stage1 {
$code.="mov 0+$r_store, %rax
    mov %rax, $t2
    mul $acc0
    mov %rax, $t0
    mov %rdx, $t1
    mov 0+$r_store, %rax
    mul $acc1
    imulq $acc2, $t2
    add %rax, $t1
    adc %rdx, $t2\n";
}

sub poly_stage2 {
$code.="mov 8+$r_store, %rax
    mov %rax, $t3
    mul $acc0
    add %rax, $t1
    adc \$0, %rdx
    mov %rdx, $acc0
    mov 8+$r_store, %rax
    mul $acc1
    add %rax, $t2
    adc \$0, %rdx\n";
}

sub poly_stage3 {
$code.="imulq $acc2, $t3
    add $acc0, $t2
    adc %rdx, $t3\n";
}

# At the beginning of the reduce stage t = [t3:t2:t1:t0] is a product of
# r = [r1:r0] and acc = [acc2:acc1:acc0]
# r is 124 bits at most (due to clamping) and acc is 131 bits at most
# (acc2 is at most 4 before the addition and can be at most 6 when we add in
# the next block) therefore t is at most 255 bits big, and t3 is 63 bits.
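#
# In other words, write t = c*2^130 + l with l = t mod 2^130. Since
# 2^130 == 5 (mod 2^130 - 5), we have
#     t == l + 5*c == l + 4*c + c   (mod 2^130 - 5)
# The stage below builds exactly that: acc0/acc1/acc2 pick up l (the low 130
# bits), the and/mov pair forms 4*c, the shrd/shr pair forms c, and their sum
# 5*c is folded back into the accumulator. The result is only partially
# reduced; it can still be larger than 2^130 - 5, and the full reduction only
# happens at the very end (the sub/sbb/cmovc sequence before the s part of
# the key is added).
#
# A rough Math::BigInt sketch of the same partial reduction, kept purely as
# commentary (nothing in this file defines or calls it, and the names are
# illustrative only):
#
#   use Math::BigInt;
#   sub poly_reduce_ref {
#       my ($t) = @_;                            # t = r * acc, below 2^255
#       my $two130 = Math::BigInt->new(1)->blsft(130);
#       my $c = $t->copy->brsft(130);            # c = floor(t / 2^130)
#       my $l = $t->copy->bmod($two130);         # l = t mod 2^130
#       return $l->badd($c->bmul(5));            # l + 5*c, partially reduced
#   }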
181 sub poly_reduce_stage { 182 $code.="mov $t0, $acc0 183 mov $t1, $acc1 184 mov $t2, $acc2 185 and \$3, $acc2 # At this point acc2 is 2 bits at most (value of 3) 186 mov $t2, $t0 187 and \$-4, $t0 188 mov $t3, $t1 189 shrd \$2, $t3, $t2 190 shr \$2, $t3 191 add $t0, $t2 192 adc $t1, $t3 # No carry out since t3 is 61 bits and t1 is 63 bits 193 add $t2, $acc0 194 adc $t3, $acc1 195 adc \$0, $acc2\n"; # At this point acc2 has the value of 4 at most 196 } 197 198 sub poly_mul { 199 &poly_stage1(); 200 &poly_stage2(); 201 &poly_stage3(); 202 &poly_reduce_stage(); 203 } 204 205 sub prep_state { 206 my ($n)=@_; 207 $code.="movdqa .Lchacha20_consts(%rip), $A0 208 movdqa $state1_store, $B0 209 movdqa $state2_store, $C0\n"; 210 $code.="movdqa $A0, $A1 211 movdqa $B0, $B1 212 movdqa $C0, $C1\n" if ($n ge 2); 213 $code.="movdqa $A0, $A2 214 movdqa $B0, $B2 215 movdqa $C0, $C2\n" if ($n ge 3); 216 $code.="movdqa $A0, $A3 217 movdqa $B0, $B3 218 movdqa $C0, $C3\n" if ($n ge 4); 219 $code.="movdqa $ctr0_store, $D0 220 paddd .Lsse_inc(%rip), $D0 221 movdqa $D0, $ctr0_store\n" if ($n eq 1); 222 $code.="movdqa $ctr0_store, $D1 223 paddd .Lsse_inc(%rip), $D1 224 movdqa $D1, $D0 225 paddd .Lsse_inc(%rip), $D0 226 movdqa $D0, $ctr0_store 227 movdqa $D1, $ctr1_store\n" if ($n eq 2); 228 $code.="movdqa $ctr0_store, $D2 229 paddd .Lsse_inc(%rip), $D2 230 movdqa $D2, $D1 231 paddd .Lsse_inc(%rip), $D1 232 movdqa $D1, $D0 233 paddd .Lsse_inc(%rip), $D0 234 movdqa $D0, $ctr0_store 235 movdqa $D1, $ctr1_store 236 movdqa $D2, $ctr2_store\n" if ($n eq 3); 237 $code.="movdqa $ctr0_store, $D3 238 paddd .Lsse_inc(%rip), $D3 239 movdqa $D3, $D2 240 paddd .Lsse_inc(%rip), $D2 241 movdqa $D2, $D1 242 paddd .Lsse_inc(%rip), $D1 243 movdqa $D1, $D0 244 paddd .Lsse_inc(%rip), $D0 245 movdqa $D0, $ctr0_store 246 movdqa $D1, $ctr1_store 247 movdqa $D2, $ctr2_store 248 movdqa $D3, $ctr3_store\n" if ($n eq 4); 249 } 250 251 sub finalize_state { 252 my ($n)=@_; 253 $code.="paddd .Lchacha20_consts(%rip), $A3 254 paddd $state1_store, $B3 255 paddd $state2_store, $C3 256 paddd $ctr3_store, $D3\n" if ($n eq 4); 257 $code.="paddd .Lchacha20_consts(%rip), $A2 258 paddd $state1_store, $B2 259 paddd $state2_store, $C2 260 paddd $ctr2_store, $D2\n" if ($n ge 3); 261 $code.="paddd .Lchacha20_consts(%rip), $A1 262 paddd $state1_store, $B1 263 paddd $state2_store, $C1 264 paddd $ctr1_store, $D1\n" if ($n ge 2); 265 $code.="paddd .Lchacha20_consts(%rip), $A0 266 paddd $state1_store, $B0 267 paddd $state2_store, $C0 268 paddd $ctr0_store, $D0\n"; 269 } 270 271 sub xor_stream { 272 my ($A, $B, $C, $D, $offset)=@_; 273 $code.="movdqu 0*16 + $offset($inp), $A3 274 movdqu 1*16 + $offset($inp), $B3 275 movdqu 2*16 + $offset($inp), $C3 276 movdqu 3*16 + $offset($inp), $D3 277 pxor $A3, $A 278 pxor $B3, $B 279 pxor $C3, $C 280 pxor $D, $D3 281 movdqu $A, 0*16 + $offset($oup) 282 movdqu $B, 1*16 + $offset($oup) 283 movdqu $C, 2*16 + $offset($oup) 284 movdqu $D3, 3*16 + $offset($oup)\n"; 285 } 286 287 sub xor_stream_using_temp { 288 my ($A, $B, $C, $D, $offset, $temp)=@_; 289 $code.="movdqa $temp, $tmp_store 290 movdqu 0*16 + $offset($inp), $temp 291 pxor $A, $temp 292 movdqu $temp, 0*16 + $offset($oup) 293 movdqu 1*16 + $offset($inp), $temp 294 pxor $B, $temp 295 movdqu $temp, 1*16 + $offset($oup) 296 movdqu 2*16 + $offset($inp), $temp 297 pxor $C, $temp 298 movdqu $temp, 2*16 + $offset($oup) 299 movdqu 3*16 + $offset($inp), $temp 300 pxor $D, $temp 301 movdqu $temp, 3*16 + $offset($oup)\n"; 302 } 303 304 sub gen_chacha_round { 305 my ($rot1, $rot2, 
$shift)=@_; 306 my $round=""; 307 $round.="movdqa $C0, $tmp_store\n" if ($rot1 eq 20); 308 $round.="movdqa $rot2, $C0 309 paddd $B3, $A3 310 paddd $B2, $A2 311 paddd $B1, $A1 312 paddd $B0, $A0 313 pxor $A3, $D3 314 pxor $A2, $D2 315 pxor $A1, $D1 316 pxor $A0, $D0 317 pshufb $C0, $D3 318 pshufb $C0, $D2 319 pshufb $C0, $D1 320 pshufb $C0, $D0 321 movdqa $tmp_store, $C0 322 paddd $D3, $C3 323 paddd $D2, $C2 324 paddd $D1, $C1 325 paddd $D0, $C0 326 pxor $C3, $B3 327 pxor $C2, $B2 328 pxor $C1, $B1 329 pxor $C0, $B0 330 movdqa $C0, $tmp_store 331 movdqa $B3, $C0 332 psrld \$$rot1, $C0 333 pslld \$32-$rot1, $B3 334 pxor $C0, $B3 335 movdqa $B2, $C0 336 psrld \$$rot1, $C0 337 pslld \$32-$rot1, $B2 338 pxor $C0, $B2 339 movdqa $B1, $C0 340 psrld \$$rot1, $C0 341 pslld \$32-$rot1, $B1 342 pxor $C0, $B1 343 movdqa $B0, $C0 344 psrld \$$rot1, $C0 345 pslld \$32-$rot1, $B0 346 pxor $C0, $B0\n"; 347 ($s1,$s2,$s3)=(4,8,12) if ($shift =~ /left/); 348 ($s1,$s2,$s3)=(12,8,4) if ($shift =~ /right/); 349 $round.="movdqa $tmp_store, $C0 350 palignr \$$s1, $B3, $B3 351 palignr \$$s2, $C3, $C3 352 palignr \$$s3, $D3, $D3 353 palignr \$$s1, $B2, $B2 354 palignr \$$s2, $C2, $C2 355 palignr \$$s3, $D2, $D2 356 palignr \$$s1, $B1, $B1 357 palignr \$$s2, $C1, $C1 358 palignr \$$s3, $D1, $D1 359 palignr \$$s1, $B0, $B0 360 palignr \$$s2, $C0, $C0 361 palignr \$$s3, $D0, $D0\n" 362 if (($shift =~ /left/) || ($shift =~ /right/)); 363 return $round; 364 }; 365 366 $chacha_body = &gen_chacha_round(20, ".Lrol16(%rip)") . 367 &gen_chacha_round(25, ".Lrol8(%rip)", "left") . 368 &gen_chacha_round(20, ".Lrol16(%rip)") . 369 &gen_chacha_round(25, ".Lrol8(%rip)", "right"); 370 371 my @loop_body = split /\n/, $chacha_body; 372 373 sub emit_body { 374 my ($n)=@_; 375 for (my $i=0; $i < $n; $i++) { 376 $code=$code.shift(@loop_body)."\n"; 377 }; 378 } 379 380 { 381 ################################################################################ 382 # void poly_hash_ad_internal(); 383 $code.=" 384 .type poly_hash_ad_internal,\@abi-omnipotent 385 .align 64 386 poly_hash_ad_internal: 387 .cfi_startproc 388 .cfi_def_cfa rsp, 8 389 xor $acc0, $acc0 390 xor $acc1, $acc1 391 xor $acc2, $acc2 392 cmp \$13, $itr2 393 jne .Lhash_ad_loop 394 .Lpoly_fast_tls_ad: 395 # Special treatment for the TLS case of 13 bytes 396 mov ($adp), $acc0 397 mov 5($adp), $acc1 398 shr \$24, $acc1 399 mov \$1, $acc2\n"; 400 &poly_mul(); $code.=" 401 ret 402 .Lhash_ad_loop: 403 # Hash in 16 byte chunk 404 cmp \$16, $itr2 405 jb .Lhash_ad_tail\n"; 406 &poly_add("0($adp)"); 407 &poly_mul(); $code.=" 408 lea 1*16($adp), $adp 409 sub \$16, $itr2 410 jmp .Lhash_ad_loop 411 .Lhash_ad_tail: 412 cmp \$0, $itr2 413 je .Lhash_ad_done 414 # Hash last < 16 byte tail 415 xor $t0, $t0 416 xor $t1, $t1 417 xor $t2, $t2 418 add $itr2, $adp 419 .Lhash_ad_tail_loop: 420 shld \$8, $t0, $t1 421 shl \$8, $t0 422 movzxb -1($adp), $t2 423 xor $t2, $t0 424 dec $adp 425 dec $itr2 426 jne .Lhash_ad_tail_loop 427 428 add $t0, $acc0 429 adc $t1, $acc1 430 adc \$1, $acc2\n"; 431 &poly_mul(); $code.=" 432 # Finished AD 433 .Lhash_ad_done: 434 ret 435 .cfi_endproc 436 .size poly_hash_ad_internal, .-poly_hash_ad_internal\n"; 437 } 438 439 { 440 ################################################################################ 441 # void chacha20_poly1305_open(uint8_t *out_plaintext, const uint8_t *ciphertext, 442 # size_t plaintext_len, const uint8_t *ad, 443 # size_t ad_len, 444 # union chacha20_poly1305_open_data *aead_data) 445 # 446 $code.=" 447 .globl chacha20_poly1305_open 448 .type 
chacha20_poly1305_open,\@function,6 449 .align 64 450 chacha20_poly1305_open: 451 .cfi_startproc 452 _CET_ENDBR 453 push %rbp 454 .cfi_push %rbp 455 push %rbx 456 .cfi_push %rbx 457 push %r12 458 .cfi_push %r12 459 push %r13 460 .cfi_push %r13 461 push %r14 462 .cfi_push %r14 463 push %r15 464 .cfi_push %r15 465 # We write the calculated authenticator back to keyp at the end, so save 466 # the pointer on the stack too. 467 push $keyp 468 .cfi_push $keyp 469 sub \$288 + $xmm_storage + 32, %rsp 470 .cfi_adjust_cfa_offset 288 + 32 471 472 lea 32(%rsp), %rbp 473 and \$-32, %rbp\n"; 474 $code.=" 475 movaps %xmm6,16*0+$xmm_store 476 movaps %xmm7,16*1+$xmm_store 477 movaps %xmm8,16*2+$xmm_store 478 movaps %xmm9,16*3+$xmm_store 479 movaps %xmm10,16*4+$xmm_store 480 movaps %xmm11,16*5+$xmm_store 481 movaps %xmm12,16*6+$xmm_store 482 movaps %xmm13,16*7+$xmm_store 483 movaps %xmm14,16*8+$xmm_store 484 movaps %xmm15,16*9+$xmm_store\n" if ($win64); 485 $code.=" 486 mov %rdx, $inl 487 mov $adl, 0+$len_store 488 mov $inl, 8+$len_store\n"; 489 $code.=" 490 mov OPENSSL_ia32cap_P+8(%rip), %eax 491 and \$`(1<<5) + (1<<8)`, %eax # Check both BMI2 and AVX2 are present 492 xor \$`(1<<5) + (1<<8)`, %eax 493 jz chacha20_poly1305_open_avx2\n" if ($avx>1); 494 $code.=" 495 cmp \$128, $inl 496 jbe .Lopen_sse_128 497 # For long buffers, prepare the poly key first 498 movdqa .Lchacha20_consts(%rip), $A0 499 movdqu 0*16($keyp), $B0 500 movdqu 1*16($keyp), $C0 501 movdqu 2*16($keyp), $D0 502 503 movdqa $D0, $T1 504 # Store on stack, to free keyp 505 movdqa $B0, $state1_store 506 movdqa $C0, $state2_store 507 movdqa $D0, $ctr0_store 508 mov \$10, $acc0 509 .Lopen_sse_init_rounds:\n"; 510 &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); 511 &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.=" 512 dec $acc0 513 jne .Lopen_sse_init_rounds 514 # A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded 515 paddd .Lchacha20_consts(%rip), $A0 516 paddd $state1_store, $B0 517 # Clamp and store the key 518 pand .Lclamp(%rip), $A0 519 movdqa $A0, $r_store 520 movdqa $B0, $s_store 521 # Hash 522 mov $adl, $itr2 523 call poly_hash_ad_internal 524 .Lopen_sse_main_loop: 525 cmp \$16*16, $inl 526 jb .Lopen_sse_tail 527 # Load state, increment counter blocks\n"; 528 &prep_state(4); $code.=" 529 # There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we 530 # hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16 531 mov \$4, $itr1 532 mov $inp, $itr2 533 .Lopen_sse_main_loop_rounds:\n"; 534 &emit_body(20); 535 &poly_add("0($itr2)"); $code.=" 536 lea 2*8($itr2), $itr2\n"; 537 &emit_body(20); 538 &poly_stage1(); 539 &emit_body(20); 540 &poly_stage2(); 541 &emit_body(20); 542 &poly_stage3(); 543 &emit_body(20); 544 &poly_reduce_stage(); 545 foreach $l (@loop_body) {$code.=$l."\n";} 546 @loop_body = split /\n/, $chacha_body; $code.=" 547 dec $itr1 548 jge .Lopen_sse_main_loop_rounds\n"; 549 &poly_add("0($itr2)"); 550 &poly_mul(); $code.=" 551 lea 2*8($itr2), $itr2 552 cmp \$-6, $itr1 553 jg .Lopen_sse_main_loop_rounds\n"; 554 &finalize_state(4); 555 &xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0); 556 &xor_stream($A2, $B2, $C2, $D2, "4*16"); 557 &xor_stream($A1, $B1, $C1, $D1, "8*16"); 558 &xor_stream($A0, $B0, $C0, $tmp_store, "12*16"); $code.=" 559 lea 16*16($inp), $inp 560 lea 16*16($oup), $oup 561 sub \$16*16, $inl 562 jmp .Lopen_sse_main_loop 563 .Lopen_sse_tail: 564 # Handle the various tail sizes efficiently 565 test $inl, $inl 566 jz .Lopen_sse_finalize 567 cmp \$12*16, $inl 568 ja .Lopen_sse_tail_256 569 
cmp \$8*16, $inl 570 ja .Lopen_sse_tail_192 571 cmp \$4*16, $inl 572 ja .Lopen_sse_tail_128\n"; 573 ############################################################################### 574 # At most 64 bytes are left 575 &prep_state(1); $code.=" 576 xor $itr2, $itr2 577 mov $inl, $itr1 578 cmp \$16, $itr1 579 jb .Lopen_sse_tail_64_rounds 580 .Lopen_sse_tail_64_rounds_and_x1hash: \n"; 581 &poly_add("0($inp,$itr2)"); 582 &poly_mul(); $code.=" 583 sub \$16, $itr1 584 .Lopen_sse_tail_64_rounds: 585 add \$16, $itr2\n"; 586 &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); 587 &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.=" 588 cmp \$16, $itr1 589 jae .Lopen_sse_tail_64_rounds_and_x1hash 590 cmp \$10*16, $itr2 591 jne .Lopen_sse_tail_64_rounds\n"; 592 &finalize_state(1); $code.=" 593 jmp .Lopen_sse_tail_64_dec_loop 594 ############################################################################### 595 .Lopen_sse_tail_128:\n"; 596 # 65 - 128 bytes are left 597 &prep_state(2); $code.=" 598 mov $inl, $itr1 599 and \$-16, $itr1 600 xor $itr2, $itr2 601 .Lopen_sse_tail_128_rounds_and_x1hash: \n"; 602 &poly_add("0($inp,$itr2)"); 603 &poly_mul(); $code.=" 604 .Lopen_sse_tail_128_rounds: 605 add \$16, $itr2\n"; 606 &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); 607 &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); 608 &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); 609 &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");$code.=" 610 cmp $itr1, $itr2 611 jb .Lopen_sse_tail_128_rounds_and_x1hash 612 cmp \$10*16, $itr2 613 jne .Lopen_sse_tail_128_rounds\n"; 614 &finalize_state(2); 615 &xor_stream($A1, $B1, $C1, $D1, "0*16"); $code.=" 616 sub \$4*16, $inl 617 lea 4*16($inp), $inp 618 lea 4*16($oup), $oup 619 jmp .Lopen_sse_tail_64_dec_loop 620 ############################################################################### 621 .Lopen_sse_tail_192:\n"; 622 # 129 - 192 bytes are left 623 &prep_state(3); $code.=" 624 mov $inl, $itr1 625 mov \$10*16, $itr2 626 cmp \$10*16, $itr1 627 cmovg $itr2, $itr1 628 and \$-16, $itr1 629 xor $itr2, $itr2 630 .Lopen_sse_tail_192_rounds_and_x1hash: \n"; 631 &poly_add("0($inp,$itr2)"); 632 &poly_mul(); $code.=" 633 .Lopen_sse_tail_192_rounds: 634 add \$16, $itr2\n"; 635 &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); 636 &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); 637 &chacha_qr($A2,$B2,$C2,$D2,$T0,"left"); 638 &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); 639 &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); 640 &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.=" 641 cmp $itr1, $itr2 642 jb .Lopen_sse_tail_192_rounds_and_x1hash 643 cmp \$10*16, $itr2 644 jne .Lopen_sse_tail_192_rounds 645 cmp \$11*16, $inl 646 jb .Lopen_sse_tail_192_finish\n"; 647 &poly_add("10*16($inp)"); 648 &poly_mul(); $code.=" 649 cmp \$12*16, $inl 650 jb .Lopen_sse_tail_192_finish\n"; 651 &poly_add("11*16($inp)"); 652 &poly_mul(); $code.=" 653 .Lopen_sse_tail_192_finish: \n"; 654 &finalize_state(3); 655 &xor_stream($A2, $B2, $C2, $D2, "0*16"); 656 &xor_stream($A1, $B1, $C1, $D1, "4*16"); $code.=" 657 sub \$8*16, $inl 658 lea 8*16($inp), $inp 659 lea 8*16($oup), $oup 660 jmp .Lopen_sse_tail_64_dec_loop 661 ############################################################################### 662 .Lopen_sse_tail_256:\n"; 663 # 193 - 255 bytes are left 664 &prep_state(4); $code.=" 665 xor $itr2, $itr2 666 .Lopen_sse_tail_256_rounds_and_x1hash: \n"; 667 &poly_add("0($inp,$itr2)"); 668 &chacha_qr($A0,$B0,$C0,$D0,$C3,"store_left"); 669 &chacha_qr($A1,$B1,$C1,$D1,$C3,"left"); 670 &chacha_qr($A2,$B2,$C2,$D2,$C3,"left_load"); 671 &poly_stage1(); 672 
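# Note how the scalar Poly1305 stages are threaded between the vector
# quarter-round emitters here: the multiply is split into stages precisely so
# its mul/imul/add chain can overlap with the SSE work instead of running
# after it.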
&chacha_qr($A3,$B3,$C3,$D3,$C1,"store_left_load"); 673 &poly_stage2(); 674 &chacha_qr($A0,$B0,$C0,$D0,$C3,"store_right"); 675 &chacha_qr($A1,$B1,$C1,$D1,$C3,"right"); 676 &poly_stage3(); 677 &chacha_qr($A2,$B2,$C2,$D2,$C3,"right_load"); 678 &poly_reduce_stage(); 679 &chacha_qr($A3,$B3,$C3,$D3,$C1,"store_right_load"); $code.=" 680 add \$16, $itr2 681 cmp \$10*16, $itr2 682 jb .Lopen_sse_tail_256_rounds_and_x1hash 683 684 mov $inl, $itr1 685 and \$-16, $itr1 686 .Lopen_sse_tail_256_hash: \n"; 687 &poly_add("0($inp,$itr2)"); 688 &poly_mul(); $code.=" 689 add \$16, $itr2 690 cmp $itr1, $itr2 691 jb .Lopen_sse_tail_256_hash\n"; 692 &finalize_state(4); 693 &xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0); 694 &xor_stream($A2, $B2, $C2, $D2, "4*16"); 695 &xor_stream($A1, $B1, $C1, $D1, "8*16"); $code.=" 696 movdqa $tmp_store, $D0 697 sub \$12*16, $inl 698 lea 12*16($inp), $inp 699 lea 12*16($oup), $oup 700 ############################################################################### 701 # Decrypt the remaining data, 16B at a time, using existing stream 702 .Lopen_sse_tail_64_dec_loop: 703 cmp \$16, $inl 704 jb .Lopen_sse_tail_16_init 705 sub \$16, $inl 706 movdqu ($inp), $T0 707 pxor $T0, $A0 708 movdqu $A0, ($oup) 709 lea 16($inp), $inp 710 lea 16($oup), $oup 711 movdqa $B0, $A0 712 movdqa $C0, $B0 713 movdqa $D0, $C0 714 jmp .Lopen_sse_tail_64_dec_loop 715 .Lopen_sse_tail_16_init: 716 movdqa $A0, $A1 717 718 # Decrypt up to 16 bytes at the end. 719 .Lopen_sse_tail_16: 720 test $inl, $inl 721 jz .Lopen_sse_finalize 722 723 # Read the final bytes into $T0. They need to be read in reverse order so 724 # that they end up in the correct order in $T0. 725 pxor $T0, $T0 726 lea -1($inp,$inl), $inp 727 movq $inl, $itr2 728 .Lopen_sse_tail_16_compose: 729 pslldq \$1, $T0 730 pinsrb \$0, ($inp), $T0 731 sub \$1, $inp 732 sub \$1, $itr2 733 jnz .Lopen_sse_tail_16_compose 734 735 movq $T0, $t0 736 pextrq \$1, $T0, $t1 737 # The final bytes of keystream are in $A1. 738 pxor $A1, $T0 739 740 # Copy the plaintext bytes out. 
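# They are written out one byte at a time with pextrb so that we never
# store past the end of the output buffer when fewer than 16 bytes remain.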
741 .Lopen_sse_tail_16_extract: 742 pextrb \$0, $T0, ($oup) 743 psrldq \$1, $T0 744 add \$1, $oup 745 sub \$1, $inl 746 jne .Lopen_sse_tail_16_extract 747 748 add $t0, $acc0 749 adc $t1, $acc1 750 adc \$1, $acc2\n"; 751 &poly_mul(); $code.=" 752 753 .Lopen_sse_finalize:\n"; 754 &poly_add($len_store); 755 &poly_mul(); $code.=" 756 # Final reduce 757 mov $acc0, $t0 758 mov $acc1, $t1 759 mov $acc2, $t2 760 sub \$-5, $acc0 761 sbb \$-1, $acc1 762 sbb \$3, $acc2 763 cmovc $t0, $acc0 764 cmovc $t1, $acc1 765 cmovc $t2, $acc2 766 # Add in s part of the key 767 add 0+$s_store, $acc0 768 adc 8+$s_store, $acc1\n"; 769 770 $code.=" 771 movaps 16*0+$xmm_store, %xmm6 772 movaps 16*1+$xmm_store, %xmm7 773 movaps 16*2+$xmm_store, %xmm8 774 movaps 16*3+$xmm_store, %xmm9 775 movaps 16*4+$xmm_store, %xmm10 776 movaps 16*5+$xmm_store, %xmm11 777 movaps 16*6+$xmm_store, %xmm12 778 movaps 16*7+$xmm_store, %xmm13 779 movaps 16*8+$xmm_store, %xmm14 780 movaps 16*9+$xmm_store, %xmm15\n" if ($win64); 781 $code.=" 782 .cfi_remember_state 783 add \$288 + $xmm_storage + 32, %rsp 784 .cfi_adjust_cfa_offset -(288 + 32) 785 # The tag replaces the key on return 786 pop $keyp 787 .cfi_pop $keyp 788 mov $acc0, ($keyp) 789 mov $acc1, 8($keyp) 790 pop %r15 791 .cfi_pop %r15 792 pop %r14 793 .cfi_pop %r14 794 pop %r13 795 .cfi_pop %r13 796 pop %r12 797 .cfi_pop %r12 798 pop %rbx 799 .cfi_pop %rbx 800 pop %rbp 801 .cfi_pop %rbp 802 ret 803 ############################################################################### 804 .Lopen_sse_128: 805 .cfi_restore_state 806 movdqu .Lchacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2 807 movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2 808 movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2 809 movdqu 2*16($keyp), $D0 810 movdqa $D0, $D1\npaddd .Lsse_inc(%rip), $D1 811 movdqa $D1, $D2\npaddd .Lsse_inc(%rip), $D2 812 movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D1, $T3 813 mov \$10, $acc0 814 815 .Lopen_sse_128_rounds: \n"; 816 &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); 817 &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); 818 &chacha_qr($A2,$B2,$C2,$D2,$T0,"left"); 819 &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); 820 &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); 821 &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.=" 822 dec $acc0 823 jnz .Lopen_sse_128_rounds 824 paddd .Lchacha20_consts(%rip), $A0 825 paddd .Lchacha20_consts(%rip), $A1 826 paddd .Lchacha20_consts(%rip), $A2 827 paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2 828 paddd $T2, $C1\npaddd $T2, $C2 829 paddd $T3, $D1 830 paddd .Lsse_inc(%rip), $T3 831 paddd $T3, $D2 832 # Clamp and store the key 833 pand .Lclamp(%rip), $A0 834 movdqa $A0, $r_store 835 movdqa $B0, $s_store 836 # Hash 837 mov $adl, $itr2 838 call poly_hash_ad_internal 839 .Lopen_sse_128_xor_hash: 840 cmp \$16, $inl 841 jb .Lopen_sse_tail_16 842 sub \$16, $inl\n"; 843 # Load for hashing 844 &poly_add("0*8($inp)"); $code.=" 845 # Load for decryption 846 movdqu 0*16($inp), $T0 847 pxor $T0, $A1 848 movdqu $A1, 0*16($oup) 849 lea 1*16($inp), $inp 850 lea 1*16($oup), $oup\n"; 851 &poly_mul(); $code.=" 852 # Shift the stream left 853 movdqa $B1, $A1 854 movdqa $C1, $B1 855 movdqa $D1, $C1 856 movdqa $A2, $D1 857 movdqa $B2, $A2 858 movdqa $C2, $B2 859 movdqa $D2, $C2 860 jmp .Lopen_sse_128_xor_hash 861 .size chacha20_poly1305_open, .-chacha20_poly1305_open 862 .cfi_endproc 863 864 ################################################################################ 865 ################################################################################ 866 # void 
chacha20_poly1305_seal(uint8_t *out_ciphertext, const uint8_t *plaintext, 867 # size_t plaintext_len, const uint8_t *ad, 868 # size_t ad_len, 869 # union chacha20_poly1305_seal_data *data); 870 .globl chacha20_poly1305_seal 871 .type chacha20_poly1305_seal,\@function,6 872 .align 64 873 chacha20_poly1305_seal: 874 .cfi_startproc 875 _CET_ENDBR 876 push %rbp 877 .cfi_push %rbp 878 push %rbx 879 .cfi_push %rbx 880 push %r12 881 .cfi_push %r12 882 push %r13 883 .cfi_push %r13 884 push %r14 885 .cfi_push %r14 886 push %r15 887 .cfi_push %r15 888 # We write the calculated authenticator back to keyp at the end, so save 889 # the pointer on the stack too. 890 push $keyp 891 .cfi_push $keyp 892 sub \$288 + $xmm_storage + 32, %rsp 893 .cfi_adjust_cfa_offset 288 + 32 894 lea 32(%rsp), %rbp 895 and \$-32, %rbp\n"; 896 $code.=" 897 movaps %xmm6,16*0+$xmm_store 898 movaps %xmm7,16*1+$xmm_store 899 movaps %xmm8,16*2+$xmm_store 900 movaps %xmm9,16*3+$xmm_store 901 movaps %xmm10,16*4+$xmm_store 902 movaps %xmm11,16*5+$xmm_store 903 movaps %xmm12,16*6+$xmm_store 904 movaps %xmm13,16*7+$xmm_store 905 movaps %xmm14,16*8+$xmm_store 906 movaps %xmm15,16*9+$xmm_store\n" if ($win64); 907 $code.=" 908 mov 56($keyp), $inl # extra_in_len 909 addq %rdx, $inl 910 mov $adl, 0+$len_store 911 mov $inl, 8+$len_store 912 mov %rdx, $inl\n"; 913 $code.=" 914 mov OPENSSL_ia32cap_P+8(%rip), %eax 915 and \$`(1<<5) + (1<<8)`, %eax # Check both BMI2 and AVX2 are present 916 xor \$`(1<<5) + (1<<8)`, %eax 917 jz chacha20_poly1305_seal_avx2\n" if ($avx>1); 918 $code.=" 919 cmp \$128, $inl 920 jbe .Lseal_sse_128 921 # For longer buffers, prepare the poly key + some stream 922 movdqa .Lchacha20_consts(%rip), $A0 923 movdqu 0*16($keyp), $B0 924 movdqu 1*16($keyp), $C0 925 movdqu 2*16($keyp), $D0 926 927 movdqa $A0, $A1 928 movdqa $A0, $A2 929 movdqa $A0, $A3 930 movdqa $B0, $B1 931 movdqa $B0, $B2 932 movdqa $B0, $B3 933 movdqa $C0, $C1 934 movdqa $C0, $C2 935 movdqa $C0, $C3 936 movdqa $D0, $D3 937 paddd .Lsse_inc(%rip), $D0 938 movdqa $D0, $D2 939 paddd .Lsse_inc(%rip), $D0 940 movdqa $D0, $D1 941 paddd .Lsse_inc(%rip), $D0 942 # Store on stack 943 movdqa $B0, $state1_store 944 movdqa $C0, $state2_store 945 movdqa $D0, $ctr0_store 946 movdqa $D1, $ctr1_store 947 movdqa $D2, $ctr2_store 948 movdqa $D3, $ctr3_store 949 mov \$10, $acc0 950 .Lseal_sse_init_rounds: \n"; 951 foreach $l (@loop_body) {$code.=$l."\n";} 952 @loop_body = split /\n/, $chacha_body; $code.=" 953 dec $acc0 954 jnz .Lseal_sse_init_rounds\n"; 955 &finalize_state(4); $code.=" 956 # Clamp and store the key 957 pand .Lclamp(%rip), $A3 958 movdqa $A3, $r_store 959 movdqa $B3, $s_store 960 # Hash 961 mov $adl, $itr2 962 call poly_hash_ad_internal\n"; 963 &xor_stream($A2,$B2,$C2,$D2,"0*16"); 964 &xor_stream($A1,$B1,$C1,$D1,"4*16"); $code.=" 965 cmp \$12*16, $inl 966 ja .Lseal_sse_main_init 967 mov \$8*16, $itr1 968 sub \$8*16, $inl 969 lea 8*16($inp), $inp 970 jmp .Lseal_sse_128_tail_hash 971 .Lseal_sse_main_init:\n"; 972 &xor_stream($A0, $B0, $C0, $D0, "8*16"); $code.=" 973 mov \$12*16, $itr1 974 sub \$12*16, $inl 975 lea 12*16($inp), $inp 976 mov \$2, $itr1 977 mov \$8, $itr2 978 cmp \$4*16, $inl 979 jbe .Lseal_sse_tail_64 980 cmp \$8*16, $inl 981 jbe .Lseal_sse_tail_128 982 cmp \$12*16, $inl 983 jbe .Lseal_sse_tail_192 984 985 .Lseal_sse_main_loop: \n"; 986 # The main loop 987 &prep_state(4); $code.=" 988 .align 32 989 .Lseal_sse_main_rounds: \n"; 990 &emit_body(20); 991 &poly_add("0($oup)"); 992 &emit_body(20); 993 &poly_stage1(); 994 &emit_body(20); 995 
&poly_stage2(); 996 &emit_body(20); 997 &poly_stage3(); 998 &emit_body(20); 999 &poly_reduce_stage(); 1000 foreach $l (@loop_body) {$code.=$l."\n";} 1001 @loop_body = split /\n/, $chacha_body; $code.=" 1002 lea 16($oup), $oup 1003 dec $itr2 1004 jge .Lseal_sse_main_rounds\n"; 1005 &poly_add("0*8($oup)"); 1006 &poly_mul(); $code.=" 1007 lea 16($oup), $oup 1008 dec $itr1 1009 jg .Lseal_sse_main_rounds\n"; 1010 1011 &finalize_state(4);$code.=" 1012 movdqa $D2, $tmp_store\n"; 1013 &xor_stream_using_temp($A3,$B3,$C3,$D3,0*16,$D2); $code.=" 1014 movdqa $tmp_store, $D2\n"; 1015 &xor_stream($A2,$B2,$C2,$D2, 4*16); 1016 &xor_stream($A1,$B1,$C1,$D1, 8*16); $code.=" 1017 cmp \$16*16, $inl 1018 ja .Lseal_sse_main_loop_xor 1019 1020 mov \$12*16, $itr1 1021 sub \$12*16, $inl 1022 lea 12*16($inp), $inp 1023 jmp .Lseal_sse_128_tail_hash 1024 .Lseal_sse_main_loop_xor: \n"; 1025 &xor_stream($A0,$B0,$C0,$D0,"12*16"); $code.=" 1026 lea 16*16($inp), $inp 1027 sub \$16*16, $inl 1028 mov \$6, $itr1 1029 mov \$4, $itr2 1030 cmp \$12*16, $inl 1031 jg .Lseal_sse_main_loop 1032 mov $inl, $itr1 1033 test $inl, $inl 1034 je .Lseal_sse_128_tail_hash 1035 mov \$6, $itr1 1036 cmp \$8*16, $inl 1037 ja .Lseal_sse_tail_192 1038 cmp \$4*16, $inl 1039 ja .Lseal_sse_tail_128 1040 ############################################################################### 1041 .Lseal_sse_tail_64: \n"; 1042 &prep_state(1); $code.=" 1043 .Lseal_sse_tail_64_rounds_and_x2hash: \n"; 1044 &poly_add("0($oup)"); 1045 &poly_mul(); $code.=" 1046 lea 16($oup), $oup 1047 .Lseal_sse_tail_64_rounds_and_x1hash: \n"; 1048 &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); 1049 &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); 1050 &poly_add("0($oup)"); 1051 &poly_mul(); $code.=" 1052 lea 16($oup), $oup 1053 dec $itr1 1054 jg .Lseal_sse_tail_64_rounds_and_x2hash 1055 dec $itr2 1056 jge .Lseal_sse_tail_64_rounds_and_x1hash\n"; 1057 &finalize_state(1); $code.=" 1058 jmp .Lseal_sse_128_tail_xor 1059 ############################################################################### 1060 .Lseal_sse_tail_128:\n"; 1061 &prep_state(2); $code.=" 1062 .Lseal_sse_tail_128_rounds_and_x2hash: \n"; 1063 &poly_add("0($oup)"); 1064 &poly_mul(); $code.=" 1065 lea 16($oup), $oup 1066 .Lseal_sse_tail_128_rounds_and_x1hash: \n"; 1067 &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); 1068 &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); 1069 &poly_add("0($oup)"); 1070 &poly_mul(); 1071 &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); 1072 &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); $code.=" 1073 lea 16($oup), $oup 1074 dec $itr1 1075 jg .Lseal_sse_tail_128_rounds_and_x2hash 1076 dec $itr2 1077 jge .Lseal_sse_tail_128_rounds_and_x1hash\n"; 1078 &finalize_state(2); 1079 &xor_stream($A1,$B1,$C1,$D1,0*16); $code.=" 1080 mov \$4*16, $itr1 1081 sub \$4*16, $inl 1082 lea 4*16($inp), $inp 1083 jmp .Lseal_sse_128_tail_hash 1084 ############################################################################### 1085 .Lseal_sse_tail_192:\n"; 1086 &prep_state(3); $code.=" 1087 .Lseal_sse_tail_192_rounds_and_x2hash: \n"; 1088 &poly_add("0($oup)"); 1089 &poly_mul(); $code.=" 1090 lea 16($oup), $oup 1091 .Lseal_sse_tail_192_rounds_and_x1hash: \n"; 1092 &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); 1093 &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); 1094 &chacha_qr($A2,$B2,$C2,$D2,$T0,"left"); 1095 &poly_add("0($oup)"); 1096 &poly_mul(); 1097 &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); 1098 &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); 1099 &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.=" 1100 lea 16($oup), $oup 1101 dec $itr1 1102 jg 
.Lseal_sse_tail_192_rounds_and_x2hash 1103 dec $itr2 1104 jge .Lseal_sse_tail_192_rounds_and_x1hash\n"; 1105 &finalize_state(3); 1106 &xor_stream($A2,$B2,$C2,$D2,0*16); 1107 &xor_stream($A1,$B1,$C1,$D1,4*16); $code.=" 1108 mov \$8*16, $itr1 1109 sub \$8*16, $inl 1110 lea 8*16($inp), $inp 1111 ############################################################################### 1112 .Lseal_sse_128_tail_hash: 1113 cmp \$16, $itr1 1114 jb .Lseal_sse_128_tail_xor\n"; 1115 &poly_add("0($oup)"); 1116 &poly_mul(); $code.=" 1117 sub \$16, $itr1 1118 lea 16($oup), $oup 1119 jmp .Lseal_sse_128_tail_hash 1120 1121 .Lseal_sse_128_tail_xor: 1122 cmp \$16, $inl 1123 jb .Lseal_sse_tail_16 1124 sub \$16, $inl 1125 # Load for decryption 1126 movdqu 0*16($inp), $T0 1127 pxor $T0, $A0 1128 movdqu $A0, 0*16($oup) 1129 # Then hash 1130 add 0*8($oup), $acc0 1131 adc 1*8($oup), $acc1 1132 adc \$1, $acc2 1133 lea 1*16($inp), $inp 1134 lea 1*16($oup), $oup\n"; 1135 &poly_mul(); $code.=" 1136 # Shift the stream left 1137 movdqa $B0, $A0 1138 movdqa $C0, $B0 1139 movdqa $D0, $C0 1140 movdqa $A1, $D0 1141 movdqa $B1, $A1 1142 movdqa $C1, $B1 1143 movdqa $D1, $C1 1144 jmp .Lseal_sse_128_tail_xor 1145 1146 .Lseal_sse_tail_16: 1147 test $inl, $inl 1148 jz .Lprocess_blocks_of_extra_in 1149 # We can only load the PT one byte at a time to avoid buffer overread 1150 mov $inl, $itr2 1151 mov $inl, $itr1 1152 lea -1($inp,$inl), $inp 1153 pxor $T3, $T3 1154 .Lseal_sse_tail_16_compose: 1155 pslldq \$1, $T3 1156 pinsrb \$0, ($inp), $T3 1157 lea -1($inp), $inp 1158 dec $itr1 1159 jne .Lseal_sse_tail_16_compose 1160 1161 # XOR the keystream with the plaintext. 1162 pxor $A0, $T3 1163 1164 # Write ciphertext out, byte-by-byte. 1165 movq $inl, $itr1 1166 movdqu $T3, $A0 1167 .Lseal_sse_tail_16_extract: 1168 pextrb \$0, $A0, ($oup) 1169 psrldq \$1, $A0 1170 add \$1, $oup 1171 sub \$1, $itr1 1172 jnz .Lseal_sse_tail_16_extract 1173 1174 # $T3 contains the final (partial, non-empty) block of ciphertext which 1175 # needs to be fed into the Poly1305 state. The right-most $inl bytes of it 1176 # are valid. We need to fill it with extra_in bytes until full, or until we 1177 # run out of bytes. 1178 # 1179 # $keyp points to the tag output, which is actually a struct with the 1180 # extra_in pointer and length at offset 48. 1181 movq 288 + $xmm_storage + 32(%rsp), $keyp 1182 movq 56($keyp), $t1 # extra_in_len 1183 movq 48($keyp), $t0 # extra_in 1184 test $t1, $t1 1185 jz .Lprocess_partial_block # Common case: no bytes of extra_in 1186 1187 movq \$16, $t2 1188 subq $inl, $t2 # 16-$inl is the number of bytes that fit into $T3. 1189 cmpq $t2, $t1 # if extra_in_len < 16-$inl, only copy extra_in_len 1190 # (note that AT&T syntax reverses the arguments) 1191 jge .Lload_extra_in 1192 movq $t1, $t2 1193 1194 .Lload_extra_in: 1195 # $t2 contains the number of bytes of extra_in (pointed to by $t0) to load 1196 # into $T3. They are loaded in reverse order. 1197 leaq -1($t0,$t2), $inp 1198 # Update extra_in and extra_in_len to reflect the bytes that are about to 1199 # be read. 1200 addq $t2, $t0 1201 subq $t2, $t1 1202 movq $t0, 48($keyp) 1203 movq $t1, 56($keyp) 1204 1205 # Update $itr2, which is used to select the mask later on, to reflect the 1206 # extra bytes about to be added. 1207 addq $t2, $itr2 1208 1209 # Load $t2 bytes of extra_in into $T2. 
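# As with the ciphertext tail above, the bytes are loaded back to front, one
# at a time, so nothing outside the extra_in buffer is ever read.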
1210 pxor $T2, $T2 1211 .Lload_extra_load_loop: 1212 pslldq \$1, $T2 1213 pinsrb \$0, ($inp), $T2 1214 lea -1($inp), $inp 1215 sub \$1, $t2 1216 jnz .Lload_extra_load_loop 1217 1218 # Shift $T2 up the length of the remainder from the main encryption. Sadly, 1219 # the shift for an XMM register has to be a constant, thus we loop to do 1220 # this. 1221 movq $inl, $t2 1222 1223 .Lload_extra_shift_loop: 1224 pslldq \$1, $T2 1225 sub \$1, $t2 1226 jnz .Lload_extra_shift_loop 1227 1228 # Mask $T3 (the remainder from the main encryption) so that superfluous 1229 # bytes are zero. This means that the non-zero bytes in $T2 and $T3 are 1230 # disjoint and so we can merge them with an OR. 1231 lea .Land_masks(%rip), $t2 1232 shl \$4, $inl 1233 pand -16($t2,$inl), $T3 1234 1235 # Merge $T2 into $T3, forming the remainder block. 1236 por $T2, $T3 1237 1238 # The block of ciphertext + extra_in is ready to be included in the 1239 # Poly1305 state. 1240 movq $T3, $t0 1241 pextrq \$1, $T3, $t1 1242 add $t0, $acc0 1243 adc $t1, $acc1 1244 adc \$1, $acc2\n"; 1245 &poly_mul(); $code.=" 1246 1247 .Lprocess_blocks_of_extra_in: 1248 # There may be additional bytes of extra_in to process. 1249 movq 288+32+$xmm_storage (%rsp), $keyp 1250 movq 48($keyp), $inp # extra_in 1251 movq 56($keyp), $itr2 # extra_in_len 1252 movq $itr2, $itr1 1253 shr \$4, $itr2 # number of blocks 1254 1255 .Lprocess_extra_hash_loop: 1256 jz process_extra_in_trailer\n"; 1257 &poly_add("0($inp)"); 1258 &poly_mul(); $code.=" 1259 leaq 16($inp), $inp 1260 subq \$1, $itr2 1261 jmp .Lprocess_extra_hash_loop 1262 process_extra_in_trailer: 1263 andq \$15, $itr1 # remaining num bytes (<16) of extra_in 1264 movq $itr1, $inl 1265 jz .Ldo_length_block 1266 leaq -1($inp,$itr1), $inp 1267 1268 .Lprocess_extra_in_trailer_load: 1269 pslldq \$1, $T3 1270 pinsrb \$0, ($inp), $T3 1271 lea -1($inp), $inp 1272 sub \$1, $itr1 1273 jnz .Lprocess_extra_in_trailer_load 1274 1275 .Lprocess_partial_block: 1276 # $T3 contains $inl bytes of data to be fed into Poly1305. 
$inl != 0 1277 lea .Land_masks(%rip), $t2 1278 shl \$4, $inl 1279 pand -16($t2,$inl), $T3 1280 movq $T3, $t0 1281 pextrq \$1, $T3, $t1 1282 add $t0, $acc0 1283 adc $t1, $acc1 1284 adc \$1, $acc2\n"; 1285 &poly_mul(); $code.=" 1286 1287 .Ldo_length_block:\n"; 1288 &poly_add($len_store); 1289 &poly_mul(); $code.=" 1290 # Final reduce 1291 mov $acc0, $t0 1292 mov $acc1, $t1 1293 mov $acc2, $t2 1294 sub \$-5, $acc0 1295 sbb \$-1, $acc1 1296 sbb \$3, $acc2 1297 cmovc $t0, $acc0 1298 cmovc $t1, $acc1 1299 cmovc $t2, $acc2 1300 # Add in s part of the key 1301 add 0+$s_store, $acc0 1302 adc 8+$s_store, $acc1\n"; 1303 1304 $code.=" 1305 movaps 16*0+$xmm_store, %xmm6 1306 movaps 16*1+$xmm_store, %xmm7 1307 movaps 16*2+$xmm_store, %xmm8 1308 movaps 16*3+$xmm_store, %xmm9 1309 movaps 16*4+$xmm_store, %xmm10 1310 movaps 16*5+$xmm_store, %xmm11 1311 movaps 16*6+$xmm_store, %xmm12 1312 movaps 16*7+$xmm_store, %xmm13 1313 movaps 16*8+$xmm_store, %xmm14 1314 movaps 16*9+$xmm_store, %xmm15\n" if ($win64); 1315 $code.=" 1316 .cfi_remember_state 1317 add \$288 + $xmm_storage + 32, %rsp 1318 .cfi_adjust_cfa_offset -(288 + 32) 1319 # The tag replaces the key on return 1320 pop $keyp 1321 .cfi_pop $keyp 1322 mov $acc0, ($keyp) 1323 mov $acc1, 8($keyp) 1324 pop %r15 1325 .cfi_pop %r15 1326 pop %r14 1327 .cfi_pop %r14 1328 pop %r13 1329 .cfi_pop %r13 1330 pop %r12 1331 .cfi_pop %r12 1332 pop %rbx 1333 .cfi_pop %rbx 1334 pop %rbp 1335 .cfi_pop %rbp 1336 ret 1337 ################################################################################ 1338 .Lseal_sse_128: 1339 .cfi_restore_state 1340 movdqu .Lchacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2 1341 movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2 1342 movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2 1343 movdqu 2*16($keyp), $D2 1344 movdqa $D2, $D0\npaddd .Lsse_inc(%rip), $D0 1345 movdqa $D0, $D1\npaddd .Lsse_inc(%rip), $D1 1346 movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D0, $T3 1347 mov \$10, $acc0 1348 1349 .Lseal_sse_128_rounds:\n"; 1350 &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); 1351 &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); 1352 &chacha_qr($A2,$B2,$C2,$D2,$T0,"left"); 1353 &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); 1354 &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); 1355 &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.=" 1356 dec $acc0 1357 jnz .Lseal_sse_128_rounds 1358 paddd .Lchacha20_consts(%rip), $A0 1359 paddd .Lchacha20_consts(%rip), $A1 1360 paddd .Lchacha20_consts(%rip), $A2 1361 paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2 1362 paddd $T2, $C0\npaddd $T2, $C1 1363 paddd $T3, $D0 1364 paddd .Lsse_inc(%rip), $T3 1365 paddd $T3, $D1 1366 # Clamp and store the key 1367 pand .Lclamp(%rip), $A2 1368 movdqa $A2, $r_store 1369 movdqa $B2, $s_store 1370 # Hash 1371 mov %r8, $itr2 1372 call poly_hash_ad_internal 1373 jmp .Lseal_sse_128_tail_xor 1374 .size chacha20_poly1305_seal, .-chacha20_poly1305_seal 1375 .cfi_endproc\n"; 1376 } 1377 1378 if ($avx>1) { 1379 1380 ($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)=map("%ymm$_",(0..15)); 1381 my ($A0x,$A1x,$A2x,$A3x,$B0x,$B1x,$B2x,$B3x,$C0x,$C1x,$C2x,$C3x,$D0x,$D1x,$D2x,$D3x)=map("%xmm$_",(0..15)); 1382 ($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3); 1383 $state1_store="$xmm_storage+2*32(%rbp)"; 1384 $state2_store="$xmm_storage+3*32(%rbp)"; 1385 $tmp_store="$xmm_storage+4*32(%rbp)"; 1386 $ctr0_store="$xmm_storage+5*32(%rbp)"; 1387 $ctr1_store="$xmm_storage+6*32(%rbp)"; 1388 $ctr2_store="$xmm_storage+7*32(%rbp)"; 1389 $ctr3_store="$xmm_storage+8*32(%rbp)"; 1390 1391 sub chacha_qr_avx2 { 
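# Same quarter-round as chacha_qr above, but on 256-bit ymm registers, so one
# call advances two independent ChaCha20 states at a time (one per 128-bit
# lane). The 16- and 8-bit rotations go through vpshufb with .Lrol16/.Lrol8;
# the 12- and 7-bit rotations use a shift/shift/xor triple.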
1392 my ($a,$b,$c,$d,$t,$dir)=@_; 1393 $code.=<<___ if ($dir =~ /store/); 1394 vmovdqa $t, $tmp_store 1395 ___ 1396 $code.=<<___; 1397 vpaddd $b, $a, $a 1398 vpxor $a, $d, $d 1399 vpshufb .Lrol16(%rip), $d, $d 1400 vpaddd $d, $c, $c 1401 vpxor $c, $b, $b 1402 vpsrld \$20, $b, $t 1403 vpslld \$12, $b, $b 1404 vpxor $t, $b, $b 1405 vpaddd $b, $a, $a 1406 vpxor $a, $d, $d 1407 vpshufb .Lrol8(%rip), $d, $d 1408 vpaddd $d, $c, $c 1409 vpxor $c, $b, $b 1410 vpslld \$7, $b, $t 1411 vpsrld \$25, $b, $b 1412 vpxor $t, $b, $b 1413 ___ 1414 $code.=<<___ if ($dir =~ /left/); 1415 vpalignr \$12, $d, $d, $d 1416 vpalignr \$8, $c, $c, $c 1417 vpalignr \$4, $b, $b, $b 1418 ___ 1419 $code.=<<___ if ($dir =~ /right/); 1420 vpalignr \$4, $d, $d, $d 1421 vpalignr \$8, $c, $c, $c 1422 vpalignr \$12, $b, $b, $b 1423 ___ 1424 $code.=<<___ if ($dir =~ /load/); 1425 vmovdqa $tmp_store, $t 1426 ___ 1427 } 1428 1429 sub prep_state_avx2 { 1430 my ($n)=@_; 1431 $code.=<<___; 1432 vmovdqa .Lchacha20_consts(%rip), $A0 1433 vmovdqa $state1_store, $B0 1434 vmovdqa $state2_store, $C0 1435 ___ 1436 $code.=<<___ if ($n ge 2); 1437 vmovdqa $A0, $A1 1438 vmovdqa $B0, $B1 1439 vmovdqa $C0, $C1 1440 ___ 1441 $code.=<<___ if ($n ge 3); 1442 vmovdqa $A0, $A2 1443 vmovdqa $B0, $B2 1444 vmovdqa $C0, $C2 1445 ___ 1446 $code.=<<___ if ($n ge 4); 1447 vmovdqa $A0, $A3 1448 vmovdqa $B0, $B3 1449 vmovdqa $C0, $C3 1450 ___ 1451 $code.=<<___ if ($n eq 1); 1452 vmovdqa .Lavx2_inc(%rip), $D0 1453 vpaddd $ctr0_store, $D0, $D0 1454 vmovdqa $D0, $ctr0_store 1455 ___ 1456 $code.=<<___ if ($n eq 2); 1457 vmovdqa .Lavx2_inc(%rip), $D0 1458 vpaddd $ctr0_store, $D0, $D1 1459 vpaddd $D1, $D0, $D0 1460 vmovdqa $D0, $ctr0_store 1461 vmovdqa $D1, $ctr1_store 1462 ___ 1463 $code.=<<___ if ($n eq 3); 1464 vmovdqa .Lavx2_inc(%rip), $D0 1465 vpaddd $ctr0_store, $D0, $D2 1466 vpaddd $D2, $D0, $D1 1467 vpaddd $D1, $D0, $D0 1468 vmovdqa $D0, $ctr0_store 1469 vmovdqa $D1, $ctr1_store 1470 vmovdqa $D2, $ctr2_store 1471 ___ 1472 $code.=<<___ if ($n eq 4); 1473 vmovdqa .Lavx2_inc(%rip), $D0 1474 vpaddd $ctr0_store, $D0, $D3 1475 vpaddd $D3, $D0, $D2 1476 vpaddd $D2, $D0, $D1 1477 vpaddd $D1, $D0, $D0 1478 vmovdqa $D3, $ctr3_store 1479 vmovdqa $D2, $ctr2_store 1480 vmovdqa $D1, $ctr1_store 1481 vmovdqa $D0, $ctr0_store 1482 ___ 1483 } 1484 1485 sub finalize_state_avx2 { 1486 my ($n)=@_; 1487 $code.=<<___ if ($n eq 4); 1488 vpaddd .Lchacha20_consts(%rip), $A3, $A3 1489 vpaddd $state1_store, $B3, $B3 1490 vpaddd $state2_store, $C3, $C3 1491 vpaddd $ctr3_store, $D3, $D3 1492 ___ 1493 $code.=<<___ if ($n ge 3); 1494 vpaddd .Lchacha20_consts(%rip), $A2, $A2 1495 vpaddd $state1_store, $B2, $B2 1496 vpaddd $state2_store, $C2, $C2 1497 vpaddd $ctr2_store, $D2, $D2 1498 ___ 1499 $code.=<<___ if ($n ge 2); 1500 vpaddd .Lchacha20_consts(%rip), $A1, $A1 1501 vpaddd $state1_store, $B1, $B1 1502 vpaddd $state2_store, $C1, $C1 1503 vpaddd $ctr1_store, $D1, $D1 1504 ___ 1505 $code.=<<___; 1506 vpaddd .Lchacha20_consts(%rip), $A0, $A0 1507 vpaddd $state1_store, $B0, $B0 1508 vpaddd $state2_store, $C0, $C0 1509 vpaddd $ctr0_store, $D0, $D0 1510 ___ 1511 } 1512 1513 sub xor_stream_avx2 { 1514 my ($A, $B, $C, $D, $offset, $hlp)=@_; 1515 $code.=<<___; 1516 vperm2i128 \$0x02, $A, $B, $hlp 1517 vperm2i128 \$0x13, $A, $B, $B 1518 vperm2i128 \$0x02, $C, $D, $A 1519 vperm2i128 \$0x13, $C, $D, $C 1520 vpxor 0*32+$offset($inp), $hlp, $hlp 1521 vpxor 1*32+$offset($inp), $A, $A 1522 vpxor 2*32+$offset($inp), $B, $B 1523 vpxor 3*32+$offset($inp), $C, $C 1524 vmovdqu $hlp, 0*32+$offset($oup) 1525 
vmovdqu $A, 1*32+$offset($oup) 1526 vmovdqu $B, 2*32+$offset($oup) 1527 vmovdqu $C, 3*32+$offset($oup) 1528 ___ 1529 } 1530 1531 sub finish_stream_avx2 { 1532 my ($A, $B, $C, $D, $hlp)=@_; 1533 $code.=<<___; 1534 vperm2i128 \$0x13, $A, $B, $hlp 1535 vperm2i128 \$0x02, $A, $B, $A 1536 vperm2i128 \$0x02, $C, $D, $B 1537 vperm2i128 \$0x13, $C, $D, $D 1538 vmovdqa $hlp, $C 1539 ___ 1540 } 1541 1542 sub poly_stage1_mulx { 1543 $code.=<<___; 1544 mov 0+$r_store, %rdx 1545 mov %rdx, $t2 1546 mulx $acc0, $t0, $t1 1547 mulx $acc1, %rax, %rdx 1548 imulq $acc2, $t2 1549 add %rax, $t1 1550 adc %rdx, $t2 1551 ___ 1552 } 1553 1554 sub poly_stage2_mulx { 1555 $code.=<<___; 1556 mov 8+$r_store, %rdx 1557 mulx $acc0, $acc0, %rax 1558 add $acc0, $t1 1559 mulx $acc1, $acc1, $t3 1560 adc $acc1, $t2 1561 adc \$0, $t3 1562 imulq $acc2, %rdx 1563 ___ 1564 } 1565 1566 sub poly_stage3_mulx { 1567 $code.=<<___; 1568 add %rax, $t2 1569 adc %rdx, $t3 1570 ___ 1571 } 1572 1573 sub poly_mul_mulx { 1574 &poly_stage1_mulx(); 1575 &poly_stage2_mulx(); 1576 &poly_stage3_mulx(); 1577 &poly_reduce_stage(); 1578 } 1579 1580 sub gen_chacha_round_avx2 { 1581 my ($rot1, $rot2, $shift)=@_; 1582 my $round=""; 1583 $round=$round ."vmovdqa $C0, $tmp_store\n" if ($rot1 eq 20); 1584 $round=$round ."vmovdqa $rot2, $C0 1585 vpaddd $B3, $A3, $A3 1586 vpaddd $B2, $A2, $A2 1587 vpaddd $B1, $A1, $A1 1588 vpaddd $B0, $A0, $A0 1589 vpxor $A3, $D3, $D3 1590 vpxor $A2, $D2, $D2 1591 vpxor $A1, $D1, $D1 1592 vpxor $A0, $D0, $D0 1593 vpshufb $C0, $D3, $D3 1594 vpshufb $C0, $D2, $D2 1595 vpshufb $C0, $D1, $D1 1596 vpshufb $C0, $D0, $D0 1597 vpaddd $D3, $C3, $C3 1598 vpaddd $D2, $C2, $C2 1599 vpaddd $D1, $C1, $C1 1600 vpaddd $tmp_store, $D0, $C0 1601 vpxor $C3, $B3, $B3 1602 vpxor $C2, $B2, $B2 1603 vpxor $C1, $B1, $B1 1604 vpxor $C0, $B0, $B0 1605 vmovdqa $C0, $tmp_store 1606 vpsrld \$$rot1, $B3, $C0 1607 vpslld \$32-$rot1, $B3, $B3 1608 vpxor $C0, $B3, $B3 1609 vpsrld \$$rot1, $B2, $C0 1610 vpslld \$32-$rot1, $B2, $B2 1611 vpxor $C0, $B2, $B2 1612 vpsrld \$$rot1, $B1, $C0 1613 vpslld \$32-$rot1, $B1, $B1 1614 vpxor $C0, $B1, $B1 1615 vpsrld \$$rot1, $B0, $C0 1616 vpslld \$32-$rot1, $B0, $B0 1617 vpxor $C0, $B0, $B0\n"; 1618 ($s1,$s2,$s3)=(4,8,12) if ($shift =~ /left/); 1619 ($s1,$s2,$s3)=(12,8,4) if ($shift =~ /right/); 1620 $round=$round ."vmovdqa $tmp_store, $C0 1621 vpalignr \$$s1, $B3, $B3, $B3 1622 vpalignr \$$s2, $C3, $C3, $C3 1623 vpalignr \$$s3, $D3, $D3, $D3 1624 vpalignr \$$s1, $B2, $B2, $B2 1625 vpalignr \$$s2, $C2, $C2, $C2 1626 vpalignr \$$s3, $D2, $D2, $D2 1627 vpalignr \$$s1, $B1, $B1, $B1 1628 vpalignr \$$s2, $C1, $C1, $C1 1629 vpalignr \$$s3, $D1, $D1, $D1 1630 vpalignr \$$s1, $B0, $B0, $B0 1631 vpalignr \$$s2, $C0, $C0, $C0 1632 vpalignr \$$s3, $D0, $D0, $D0\n" 1633 if (($shift =~ /left/) || ($shift =~ /right/)); 1634 return $round; 1635 }; 1636 1637 $chacha_body = &gen_chacha_round_avx2(20, ".Lrol16(%rip)") . 1638 &gen_chacha_round_avx2(25, ".Lrol8(%rip)", "left") . 1639 &gen_chacha_round_avx2(20, ".Lrol16(%rip)") . 
1640 &gen_chacha_round_avx2(25, ".Lrol8(%rip)", "right"); 1641 1642 @loop_body = split /\n/, $chacha_body; 1643 1644 $code.=" 1645 ############################################################################### 1646 .type chacha20_poly1305_open_avx2,\@abi-omnipotent 1647 .align 64 1648 chacha20_poly1305_open_avx2: 1649 .cfi_startproc 1650 1651 # Since the AVX2 function operates in the frame of the SSE function, we just copy the frame state to over here 1652 .cfi_push %rbp 1653 .cfi_push %rbx 1654 .cfi_push %r12 1655 .cfi_push %r13 1656 .cfi_push %r14 1657 .cfi_push %r15 1658 .cfi_push $keyp 1659 .cfi_adjust_cfa_offset 288 + 32 1660 1661 vzeroupper 1662 vmovdqa .Lchacha20_consts(%rip), $A0 1663 vbroadcasti128 0*16($keyp), $B0 1664 vbroadcasti128 1*16($keyp), $C0 1665 vbroadcasti128 2*16($keyp), $D0 1666 vpaddd .Lavx2_init(%rip), $D0, $D0 1667 cmp \$6*32, $inl 1668 jbe .Lopen_avx2_192 1669 cmp \$10*32, $inl 1670 jbe .Lopen_avx2_320 1671 1672 vmovdqa $B0, $state1_store 1673 vmovdqa $C0, $state2_store 1674 vmovdqa $D0, $ctr0_store 1675 mov \$10, $acc0 1676 .Lopen_avx2_init_rounds: \n"; 1677 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); 1678 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.=" 1679 dec $acc0 1680 jne .Lopen_avx2_init_rounds 1681 vpaddd .Lchacha20_consts(%rip), $A0, $A0 1682 vpaddd $state1_store, $B0, $B0 1683 vpaddd $state2_store, $C0, $C0 1684 vpaddd $ctr0_store, $D0, $D0 1685 1686 vperm2i128 \$0x02, $A0, $B0, $T0 1687 # Clamp and store key 1688 vpand .Lclamp(%rip), $T0, $T0 1689 vmovdqa $T0, $r_store 1690 # Stream for the first 64 bytes 1691 vperm2i128 \$0x13, $A0, $B0, $A0 1692 vperm2i128 \$0x13, $C0, $D0, $B0 1693 # Hash AD + first 64 bytes 1694 mov $adl, $itr2 1695 call poly_hash_ad_internal 1696 # Hash first 64 bytes 1697 xor $itr1, $itr1 1698 .Lopen_avx2_init_hash: \n"; 1699 &poly_add("0($inp,$itr1)"); 1700 &poly_mul(); $code.=" 1701 add \$16, $itr1 1702 cmp \$2*32, $itr1 1703 jne .Lopen_avx2_init_hash 1704 # Decrypt first 64 bytes 1705 vpxor 0*32($inp), $A0, $A0 1706 vpxor 1*32($inp), $B0, $B0 1707 # Store first 64 bytes of decrypted data 1708 vmovdqu $A0, 0*32($oup) 1709 vmovdqu $B0, 1*32($oup) 1710 lea 2*32($inp), $inp 1711 lea 2*32($oup), $oup 1712 sub \$2*32, $inl 1713 .Lopen_avx2_main_loop: 1714 # Hash and decrypt 512 bytes each iteration 1715 cmp \$16*32, $inl 1716 jb .Lopen_avx2_main_loop_done\n"; 1717 &prep_state_avx2(4); $code.=" 1718 xor $itr1, $itr1 1719 .Lopen_avx2_main_loop_rounds: \n"; 1720 &poly_add("0*8($inp,$itr1)"); 1721 &emit_body(10); 1722 &poly_stage1_mulx(); 1723 &emit_body(9); 1724 &poly_stage2_mulx(); 1725 &emit_body(12); 1726 &poly_stage3_mulx(); 1727 &emit_body(10); 1728 &poly_reduce_stage(); 1729 &emit_body(9); 1730 &poly_add("2*8($inp,$itr1)"); 1731 &emit_body(8); 1732 &poly_stage1_mulx(); 1733 &emit_body(18); 1734 &poly_stage2_mulx(); 1735 &emit_body(18); 1736 &poly_stage3_mulx(); 1737 &emit_body(9); 1738 &poly_reduce_stage(); 1739 &emit_body(8); 1740 &poly_add("4*8($inp,$itr1)"); $code.=" 1741 lea 6*8($itr1), $itr1\n"; 1742 &emit_body(18); 1743 &poly_stage1_mulx(); 1744 &emit_body(8); 1745 &poly_stage2_mulx(); 1746 &emit_body(8); 1747 &poly_stage3_mulx(); 1748 &emit_body(18); 1749 &poly_reduce_stage(); 1750 foreach $l (@loop_body) {$code.=$l."\n";} 1751 @loop_body = split /\n/, $chacha_body; $code.=" 1752 cmp \$10*6*8, $itr1 1753 jne .Lopen_avx2_main_loop_rounds\n"; 1754 &finalize_state_avx2(4); $code.=" 1755 vmovdqa $A0, $tmp_store\n"; 1756 &poly_add("10*6*8($inp)"); 1757 &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.=" 1758 
vmovdqa $tmp_store, $A0\n"; 1759 &poly_mul(); 1760 &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3); 1761 &poly_add("10*6*8+2*8($inp)"); 1762 &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3); 1763 &poly_mul(); 1764 &xor_stream_avx2($A0, $B0, $C0, $D0, 12*32, $A3); $code.=" 1765 lea 16*32($inp), $inp 1766 lea 16*32($oup), $oup 1767 sub \$16*32, $inl 1768 jmp .Lopen_avx2_main_loop 1769 .Lopen_avx2_main_loop_done: 1770 test $inl, $inl 1771 vzeroupper 1772 je .Lopen_sse_finalize 1773 1774 cmp \$12*32, $inl 1775 ja .Lopen_avx2_tail_512 1776 cmp \$8*32, $inl 1777 ja .Lopen_avx2_tail_384 1778 cmp \$4*32, $inl 1779 ja .Lopen_avx2_tail_256\n"; 1780 ############################################################################### 1781 # 1-128 bytes left 1782 &prep_state_avx2(1); $code.=" 1783 xor $itr2, $itr2 1784 mov $inl, $itr1 1785 and \$-16, $itr1 1786 test $itr1, $itr1 1787 je .Lopen_avx2_tail_128_rounds # Have nothing to hash 1788 .Lopen_avx2_tail_128_rounds_and_x1hash: \n"; 1789 &poly_add("0*8($inp,$itr2)"); 1790 &poly_mul(); $code.=" 1791 .Lopen_avx2_tail_128_rounds: 1792 add \$16, $itr2\n"; 1793 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); 1794 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.=" 1795 cmp $itr1, $itr2 1796 jb .Lopen_avx2_tail_128_rounds_and_x1hash 1797 cmp \$160, $itr2 1798 jne .Lopen_avx2_tail_128_rounds\n"; 1799 &finalize_state_avx2(1); 1800 &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.=" 1801 jmp .Lopen_avx2_tail_128_xor 1802 ############################################################################### 1803 .Lopen_avx2_tail_256: \n"; 1804 # 129-256 bytes left 1805 &prep_state_avx2(2); $code.=" 1806 mov $inl, $tmp_store 1807 mov $inl, $itr1 1808 sub \$4*32, $itr1 1809 shr \$4, $itr1 1810 mov \$10, $itr2 1811 cmp \$10, $itr1 1812 cmovg $itr2, $itr1 1813 mov $inp, $inl 1814 xor $itr2, $itr2 1815 .Lopen_avx2_tail_256_rounds_and_x1hash: \n"; 1816 &poly_add("0*8($inl)"); 1817 &poly_mul_mulx(); $code.=" 1818 lea 16($inl), $inl 1819 .Lopen_avx2_tail_256_rounds: \n"; 1820 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); 1821 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); $code.=" 1822 inc $itr2\n"; 1823 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); 1824 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); 1825 &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.=" 1826 cmp $itr1, $itr2 1827 jb .Lopen_avx2_tail_256_rounds_and_x1hash 1828 cmp \$10, $itr2 1829 jne .Lopen_avx2_tail_256_rounds 1830 mov $inl, $itr2 1831 sub $inp, $inl 1832 mov $inl, $itr1 1833 mov $tmp_store, $inl 1834 .Lopen_avx2_tail_256_hash: 1835 add \$16, $itr1 1836 cmp $inl, $itr1 1837 jg .Lopen_avx2_tail_256_done\n"; 1838 &poly_add("0*8($itr2)"); 1839 &poly_mul_mulx(); $code.=" 1840 lea 16($itr2), $itr2 1841 jmp .Lopen_avx2_tail_256_hash 1842 .Lopen_avx2_tail_256_done: \n"; 1843 &finalize_state_avx2(2); 1844 &xor_stream_avx2($A1, $B1, $C1, $D1, 0*32, $T0); 1845 &finish_stream_avx2($A0, $B0, $C0, $D0, $T0); $code.=" 1846 lea 4*32($inp), $inp 1847 lea 4*32($oup), $oup 1848 sub \$4*32, $inl 1849 jmp .Lopen_avx2_tail_128_xor 1850 ############################################################################### 1851 .Lopen_avx2_tail_384: \n"; 1852 # 257-383 bytes left 1853 &prep_state_avx2(3); $code.=" 1854 mov $inl, $tmp_store 1855 mov $inl, $itr1 1856 sub \$8*32, $itr1 1857 shr \$4, $itr1 1858 add \$6, $itr1 1859 mov \$10, $itr2 1860 cmp \$10, $itr1 1861 cmovg $itr2, $itr1 1862 mov $inp, $inl 1863 xor $itr2, $itr2 1864 .Lopen_avx2_tail_384_rounds_and_x2hash: \n"; 1865 &poly_add("0*8($inl)"); 1866 &poly_mul_mulx(); $code.=" 
1867 lea 16($inl), $inl 1868 .Lopen_avx2_tail_384_rounds_and_x1hash: \n"; 1869 &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left"); 1870 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); 1871 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); 1872 &poly_add("0*8($inl)"); 1873 &poly_mul(); $code.=" 1874 lea 16($inl), $inl 1875 inc $itr2\n"; 1876 &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); 1877 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); 1878 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.=" 1879 cmp $itr1, $itr2 1880 jb .Lopen_avx2_tail_384_rounds_and_x2hash 1881 cmp \$10, $itr2 1882 jne .Lopen_avx2_tail_384_rounds_and_x1hash 1883 mov $inl, $itr2 1884 sub $inp, $inl 1885 mov $inl, $itr1 1886 mov $tmp_store, $inl 1887 .Lopen_avx2_384_tail_hash: 1888 add \$16, $itr1 1889 cmp $inl, $itr1 1890 jg .Lopen_avx2_384_tail_done\n"; 1891 &poly_add("0*8($itr2)"); 1892 &poly_mul_mulx(); $code.=" 1893 lea 16($itr2), $itr2 1894 jmp .Lopen_avx2_384_tail_hash 1895 .Lopen_avx2_384_tail_done: \n"; 1896 &finalize_state_avx2(3); 1897 &xor_stream_avx2($A2, $B2, $C2, $D2, 0*32, $T0); 1898 &xor_stream_avx2($A1, $B1, $C1, $D1, 4*32, $T0); 1899 &finish_stream_avx2($A0, $B0, $C0, $D0, $T0); $code.=" 1900 lea 8*32($inp), $inp 1901 lea 8*32($oup), $oup 1902 sub \$8*32, $inl 1903 jmp .Lopen_avx2_tail_128_xor 1904 ############################################################################### 1905 .Lopen_avx2_tail_512: \n"; 1906 # 384-512 bytes left 1907 &prep_state_avx2(4); $code.=" 1908 xor $itr1, $itr1 1909 mov $inp, $itr2 1910 .Lopen_avx2_tail_512_rounds_and_x2hash: \n"; 1911 &poly_add("0*8($itr2)"); 1912 &poly_mul(); $code.=" 1913 lea 2*8($itr2), $itr2 1914 .Lopen_avx2_tail_512_rounds_and_x1hash: \n"; 1915 &emit_body(37); 1916 &poly_add("0*8($itr2)"); 1917 &poly_mul_mulx(); 1918 &emit_body(48); 1919 &poly_add("2*8($itr2)"); 1920 &poly_mul_mulx(); $code.=" 1921 lea 4*8($itr2), $itr2\n"; 1922 foreach $l (@loop_body) {$code.=$l."\n";} 1923 @loop_body = split /\n/, $chacha_body; $code.=" 1924 inc $itr1 1925 cmp \$4, $itr1 1926 jl .Lopen_avx2_tail_512_rounds_and_x2hash 1927 cmp \$10, $itr1 1928 jne .Lopen_avx2_tail_512_rounds_and_x1hash 1929 mov $inl, $itr1 1930 sub \$12*32, $itr1 1931 and \$-16, $itr1 1932 .Lopen_avx2_tail_512_hash: 1933 test $itr1, $itr1 1934 je .Lopen_avx2_tail_512_done\n"; 1935 &poly_add("0*8($itr2)"); 1936 &poly_mul_mulx(); $code.=" 1937 lea 2*8($itr2), $itr2 1938 sub \$2*8, $itr1 1939 jmp .Lopen_avx2_tail_512_hash 1940 .Lopen_avx2_tail_512_done: \n"; 1941 &finalize_state_avx2(4); $code.=" 1942 vmovdqa $A0, $tmp_store\n"; 1943 &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.=" 1944 vmovdqa $tmp_store, $A0\n"; 1945 &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3); 1946 &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3); 1947 &finish_stream_avx2($A0, $B0, $C0, $D0, $A3); $code.=" 1948 lea 12*32($inp), $inp 1949 lea 12*32($oup), $oup 1950 sub \$12*32, $inl 1951 .Lopen_avx2_tail_128_xor: 1952 cmp \$32, $inl 1953 jb .Lopen_avx2_tail_32_xor 1954 sub \$32, $inl 1955 vpxor ($inp), $A0, $A0 1956 vmovdqu $A0, ($oup) 1957 lea 1*32($inp), $inp 1958 lea 1*32($oup), $oup 1959 vmovdqa $B0, $A0 1960 vmovdqa $C0, $B0 1961 vmovdqa $D0, $C0 1962 jmp .Lopen_avx2_tail_128_xor 1963 .Lopen_avx2_tail_32_xor: 1964 cmp \$16, $inl 1965 vmovdqa $A0x, $A1x 1966 jb .Lopen_avx2_exit 1967 sub \$16, $inl 1968 #load for decryption 1969 vpxor ($inp), $A0x, $A1x 1970 vmovdqu $A1x, ($oup) 1971 lea 1*16($inp), $inp 1972 lea 1*16($oup), $oup 1973 vperm2i128 \$0x11, $A0, $A0, $A0 1974 vmovdqa $A0x, $A1x 1975 .Lopen_avx2_exit: 1976 vzeroupper 
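# The vzeroupper above clears the upper ymm halves before we fall back into
# the SSE tail path, so that path does not pay AVX/SSE transition penalties.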
jmp .Lopen_sse_tail_16
###############################################################################
.Lopen_avx2_192:
vmovdqa $A0, $A1
vmovdqa $A0, $A2
vmovdqa $B0, $B1
vmovdqa $B0, $B2
vmovdqa $C0, $C1
vmovdqa $C0, $C2
vpaddd .Lavx2_inc(%rip), $D0, $D1
vmovdqa $D0, $T2
vmovdqa $D1, $T3
mov \$10, $acc0
.Lopen_avx2_192_rounds: \n";
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.="
dec $acc0
jne .Lopen_avx2_192_rounds
vpaddd $A2, $A0, $A0
vpaddd $A2, $A1, $A1
vpaddd $B2, $B0, $B0
vpaddd $B2, $B1, $B1
vpaddd $C2, $C0, $C0
vpaddd $C2, $C1, $C1
vpaddd $T2, $D0, $D0
vpaddd $T3, $D1, $D1
vperm2i128 \$0x02, $A0, $B0, $T0
# Clamp and store the key
vpand .Lclamp(%rip), $T0, $T0
vmovdqa $T0, $r_store
# Stream for up to 192 bytes
vperm2i128 \$0x13, $A0, $B0, $A0
vperm2i128 \$0x13, $C0, $D0, $B0
vperm2i128 \$0x02, $A1, $B1, $C0
vperm2i128 \$0x02, $C1, $D1, $D0
vperm2i128 \$0x13, $A1, $B1, $A1
vperm2i128 \$0x13, $C1, $D1, $B1
.Lopen_avx2_short:
mov $adl, $itr2
call poly_hash_ad_internal
.Lopen_avx2_short_hash_and_xor_loop:
cmp \$32, $inl
jb .Lopen_avx2_short_tail_32
sub \$32, $inl\n";
# Load + hash
&poly_add("0*8($inp)");
&poly_mul();
&poly_add("2*8($inp)");
&poly_mul(); $code.="
# Load + decrypt
vpxor ($inp), $A0, $A0
vmovdqu $A0, ($oup)
lea 1*32($inp), $inp
lea 1*32($oup), $oup
# Shift stream
vmovdqa $B0, $A0
vmovdqa $C0, $B0
vmovdqa $D0, $C0
vmovdqa $A1, $D0
vmovdqa $B1, $A1
vmovdqa $C1, $B1
vmovdqa $D1, $C1
vmovdqa $A2, $D1
vmovdqa $B2, $A2
jmp .Lopen_avx2_short_hash_and_xor_loop
.Lopen_avx2_short_tail_32:
cmp \$16, $inl
vmovdqa $A0x, $A1x
jb .Lopen_avx2_short_tail_32_exit
sub \$16, $inl\n";
&poly_add("0*8($inp)");
&poly_mul(); $code.="
vpxor ($inp), $A0x, $A3x
vmovdqu $A3x, ($oup)
lea 1*16($inp), $inp
lea 1*16($oup), $oup
vextracti128 \$1, $A0, $A1x
.Lopen_avx2_short_tail_32_exit:
vzeroupper
jmp .Lopen_sse_tail_16
###############################################################################
.Lopen_avx2_320:
vmovdqa $A0, $A1
vmovdqa $A0, $A2
vmovdqa $B0, $B1
vmovdqa $B0, $B2
vmovdqa $C0, $C1
vmovdqa $C0, $C2
vpaddd .Lavx2_inc(%rip), $D0, $D1
vpaddd .Lavx2_inc(%rip), $D1, $D2
vmovdqa $B0, $T1
vmovdqa $C0, $T2
vmovdqa $D0, $ctr0_store
vmovdqa $D1, $ctr1_store
vmovdqa $D2, $ctr2_store
mov \$10, $acc0
.Lopen_avx2_320_rounds: \n";
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
&chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
&chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
dec $acc0
jne .Lopen_avx2_320_rounds
vpaddd .Lchacha20_consts(%rip), $A0, $A0
vpaddd .Lchacha20_consts(%rip), $A1, $A1
vpaddd .Lchacha20_consts(%rip), $A2, $A2
vpaddd $T1, $B0, $B0
vpaddd $T1, $B1, $B1
vpaddd $T1, $B2, $B2
vpaddd $T2, $C0, $C0
vpaddd $T2, $C1, $C1
vpaddd $T2, $C2, $C2
vpaddd $ctr0_store, $D0, $D0
vpaddd $ctr1_store, $D1, $D1
vpaddd $ctr2_store, $D2, $D2
vperm2i128 \$0x02, $A0, $B0, $T0
# Clamp and store the key
vpand .Lclamp(%rip), $T0, $T0
vmovdqa $T0, $r_store
# Stream for up to 320 bytes
vperm2i128 \$0x13, $A0, $B0, $A0
vperm2i128 \$0x13, $C0, $D0, $B0
vperm2i128 \$0x02, $A1, $B1, $C0
vperm2i128 \$0x02, $C1, $D1, $D0
vperm2i128 \$0x13, $A1, $B1, $A1
vperm2i128 \$0x13, $C1, $D1, $B1
vperm2i128 \$0x02, $A2, $B2, $C1
vperm2i128 \$0x02, $C2, $D2, $D1
vperm2i128 \$0x13, $A2, $B2, $A2
vperm2i128 \$0x13, $C2, $D2, $B2
jmp .Lopen_avx2_short
.size chacha20_poly1305_open_avx2, .-chacha20_poly1305_open_avx2
.cfi_endproc
###############################################################################
###############################################################################
.type chacha20_poly1305_seal_avx2,\@abi-omnipotent
.align 64
chacha20_poly1305_seal_avx2:
.cfi_startproc

# Since the AVX2 function operates in the frame of the SSE function, we just
# copy the frame state over here.
.cfi_push %rbp
.cfi_push %rbx
.cfi_push %r12
.cfi_push %r13
.cfi_push %r14
.cfi_push %r15
.cfi_push $keyp
.cfi_adjust_cfa_offset 288 + 32

vzeroupper
vmovdqa .Lchacha20_consts(%rip), $A0
vbroadcasti128 0*16($keyp), $B0
vbroadcasti128 1*16($keyp), $C0
vbroadcasti128 2*16($keyp), $D0
vpaddd .Lavx2_init(%rip), $D0, $D0
cmp \$6*32, $inl
jbe .Lseal_avx2_192
cmp \$10*32, $inl
jbe .Lseal_avx2_320
vmovdqa $A0, $A1
vmovdqa $A0, $A2
vmovdqa $A0, $A3
vmovdqa $B0, $B1
vmovdqa $B0, $B2
vmovdqa $B0, $B3
vmovdqa $B0, $state1_store
vmovdqa $C0, $C1
vmovdqa $C0, $C2
vmovdqa $C0, $C3
vmovdqa $C0, $state2_store
vmovdqa $D0, $D3
vpaddd .Lavx2_inc(%rip), $D3, $D2
vpaddd .Lavx2_inc(%rip), $D2, $D1
vpaddd .Lavx2_inc(%rip), $D1, $D0
vmovdqa $D0, $ctr0_store
vmovdqa $D1, $ctr1_store
vmovdqa $D2, $ctr2_store
vmovdqa $D3, $ctr3_store
mov \$10, $acc0
.Lseal_avx2_init_rounds: \n";
foreach $l (@loop_body) {$code.=$l."\n";}
@loop_body = split /\n/, $chacha_body; $code.="
dec $acc0
jnz .Lseal_avx2_init_rounds\n";
&finalize_state_avx2(4); $code.="
vperm2i128 \$0x13, $C3, $D3, $C3
vperm2i128 \$0x02, $A3, $B3, $D3
vperm2i128 \$0x13, $A3, $B3, $A3
vpand .Lclamp(%rip), $D3, $D3
vmovdqa $D3, $r_store
mov $adl, $itr2
call poly_hash_ad_internal
# Safely store 320 bytes (otherwise would handle with optimized call)
vpxor 0*32($inp), $A3, $A3
vpxor 1*32($inp), $C3, $C3
vmovdqu $A3, 0*32($oup)
vmovdqu $C3, 1*32($oup)\n";
&xor_stream_avx2($A2,$B2,$C2,$D2,2*32,$T3);
&xor_stream_avx2($A1,$B1,$C1,$D1,6*32,$T3);
&finish_stream_avx2($A0,$B0,$C0,$D0,$T3); $code.="
lea 10*32($inp), $inp
sub \$10*32, $inl
mov \$10*32, $itr1
cmp \$4*32, $inl
jbe .Lseal_avx2_short_hash_remainder
vpxor 0*32($inp), $A0, $A0
vpxor 1*32($inp), $B0, $B0
vpxor 2*32($inp), $C0, $C0
vpxor 3*32($inp), $D0, $D0
vmovdqu $A0, 10*32($oup)
vmovdqu $B0, 11*32($oup)
vmovdqu $C0, 12*32($oup)
vmovdqu $D0, 13*32($oup)
lea 4*32($inp), $inp
sub \$4*32, $inl
mov \$8, $itr1
mov \$2, $itr2
cmp \$4*32, $inl
jbe .Lseal_avx2_tail_128
cmp \$8*32, $inl
jbe .Lseal_avx2_tail_256
cmp \$12*32, $inl
jbe .Lseal_avx2_tail_384
cmp \$16*32, $inl
jbe .Lseal_avx2_tail_512\n";
# We have 448 bytes to hash, but the main loop hashes 512 bytes at a time;
# perform some rounds before entering the main loop.
&prep_state_avx2(4);
foreach $l (@loop_body) {$code.=$l."\n";}
@loop_body = split /\n/, $chacha_body;
&emit_body(41);
@loop_body = split /\n/, $chacha_body; $code.="
sub \$16, $oup
mov \$9, $itr1
jmp .Lseal_avx2_main_loop_rounds_entry
.align 32
.Lseal_avx2_main_loop: \n";
&prep_state_avx2(4); $code.="
mov \$10, $itr1
.align 32
.Lseal_avx2_main_loop_rounds: \n";
&poly_add("0*8($oup)");
&emit_body(10);
&poly_stage1_mulx();
&emit_body(9);
&poly_stage2_mulx();
&emit_body(12);
&poly_stage3_mulx();
&emit_body(10);
&poly_reduce_stage(); $code.="
.Lseal_avx2_main_loop_rounds_entry: \n";
&emit_body(9);
&poly_add("2*8($oup)");
&emit_body(8);
&poly_stage1_mulx();
&emit_body(18);
&poly_stage2_mulx();
&emit_body(18);
&poly_stage3_mulx();
&emit_body(9);
&poly_reduce_stage();
&emit_body(8);
&poly_add("4*8($oup)"); $code.="
lea 6*8($oup), $oup\n";
&emit_body(18);
&poly_stage1_mulx();
&emit_body(8);
&poly_stage2_mulx();
&emit_body(8);
&poly_stage3_mulx();
&emit_body(18);
&poly_reduce_stage();
foreach $l (@loop_body) {$code.=$l."\n";}
@loop_body = split /\n/, $chacha_body; $code.="
dec $itr1
jne .Lseal_avx2_main_loop_rounds\n";
&finalize_state_avx2(4); $code.="
vmovdqa $A0, $tmp_store\n";
&poly_add("0*8($oup)");
&poly_mul_mulx();
&poly_add("2*8($oup)");
&poly_mul_mulx(); $code.="
lea 4*8($oup), $oup\n";
&xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
vmovdqa $tmp_store, $A0\n";
&xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
&xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
&xor_stream_avx2($A0, $B0, $C0, $D0, 12*32, $A3); $code.="
lea 16*32($inp), $inp
sub \$16*32, $inl
cmp \$16*32, $inl
jg .Lseal_avx2_main_loop
\n";
&poly_add("0*8($oup)");
&poly_mul_mulx();
&poly_add("2*8($oup)");
&poly_mul_mulx(); $code.="
lea 4*8($oup), $oup
mov \$10, $itr1
xor $itr2, $itr2

cmp \$12*32, $inl
ja .Lseal_avx2_tail_512
cmp \$8*32, $inl
ja .Lseal_avx2_tail_384
cmp \$4*32, $inl
ja .Lseal_avx2_tail_256
###############################################################################
.Lseal_avx2_tail_128:\n";
&prep_state_avx2(1); $code.="
.Lseal_avx2_tail_128_rounds_and_3xhash: \n";
&poly_add("0($oup)");
&poly_mul_mulx(); $code.="
lea 2*8($oup), $oup
.Lseal_avx2_tail_128_rounds_and_2xhash: \n";
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
&poly_add("0*8($oup)");
&poly_mul_mulx();
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
&poly_add("2*8($oup)");
&poly_mul_mulx(); $code.="
lea 4*8($oup), $oup
dec $itr1
jg .Lseal_avx2_tail_128_rounds_and_3xhash
dec $itr2
jge .Lseal_avx2_tail_128_rounds_and_2xhash\n";
&finalize_state_avx2(1);
&finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
jmp .Lseal_avx2_short_loop
###############################################################################
.Lseal_avx2_tail_256:\n";
&prep_state_avx2(2); $code.="
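# Each iteration below interleaves ChaCha20 quarter-rounds with Poly1305
# absorption of ciphertext that has already been written to the output.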
.Lseal_avx2_tail_256_rounds_and_3xhash: \n";
&poly_add("0($oup)");
&poly_mul(); $code.="
lea 2*8($oup), $oup
.Lseal_avx2_tail_256_rounds_and_2xhash: \n";
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
&poly_add("0*8($oup)");
&poly_mul();
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
&poly_add("2*8($oup)");
&poly_mul(); $code.="
lea 4*8($oup), $oup
dec $itr1
jg .Lseal_avx2_tail_256_rounds_and_3xhash
dec $itr2
jge .Lseal_avx2_tail_256_rounds_and_2xhash\n";
&finalize_state_avx2(2);
&xor_stream_avx2($A1,$B1,$C1,$D1,0*32,$T0);
&finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
mov \$4*32, $itr1
lea 4*32($inp), $inp
sub \$4*32, $inl
jmp .Lseal_avx2_short_hash_remainder
###############################################################################
.Lseal_avx2_tail_384:\n";
&prep_state_avx2(3); $code.="
.Lseal_avx2_tail_384_rounds_and_3xhash: \n";
&poly_add("0($oup)");
&poly_mul(); $code.="
lea 2*8($oup), $oup
.Lseal_avx2_tail_384_rounds_and_2xhash: \n";
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
&poly_add("0*8($oup)");
&poly_mul();
&chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
&poly_add("2*8($oup)");
&poly_mul();
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
&chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
lea 4*8($oup), $oup
dec $itr1
jg .Lseal_avx2_tail_384_rounds_and_3xhash
dec $itr2
jge .Lseal_avx2_tail_384_rounds_and_2xhash\n";
&finalize_state_avx2(3);
&xor_stream_avx2($A2,$B2,$C2,$D2,0*32,$T0);
&xor_stream_avx2($A1,$B1,$C1,$D1,4*32,$T0);
&finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
mov \$8*32, $itr1
lea 8*32($inp), $inp
sub \$8*32, $inl
jmp .Lseal_avx2_short_hash_remainder
###############################################################################
.Lseal_avx2_tail_512:\n";
&prep_state_avx2(4); $code.="
.Lseal_avx2_tail_512_rounds_and_3xhash: \n";
&poly_add("0($oup)");
&poly_mul_mulx(); $code.="
lea 2*8($oup), $oup
.Lseal_avx2_tail_512_rounds_and_2xhash: \n";
&emit_body(20);
&poly_add("0*8($oup)");
&emit_body(20);
&poly_stage1_mulx();
&emit_body(20);
&poly_stage2_mulx();
&emit_body(20);
&poly_stage3_mulx();
&emit_body(20);
&poly_reduce_stage();
&emit_body(20);
&poly_add("2*8($oup)");
&emit_body(20);
&poly_stage1_mulx();
&emit_body(20);
&poly_stage2_mulx();
&emit_body(20);
&poly_stage3_mulx();
&emit_body(20);
&poly_reduce_stage();
foreach $l (@loop_body) {$code.=$l."\n";}
@loop_body = split /\n/, $chacha_body; $code.="
lea 4*8($oup), $oup
dec $itr1
jg .Lseal_avx2_tail_512_rounds_and_3xhash
dec $itr2
jge .Lseal_avx2_tail_512_rounds_and_2xhash\n";
&finalize_state_avx2(4); $code.="
vmovdqa $A0, $tmp_store\n";
&xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
vmovdqa $tmp_store, $A0\n";
&xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
&xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
&finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
mov \$12*32, $itr1
lea 12*32($inp), $inp
sub \$12*32, $inl
jmp .Lseal_avx2_short_hash_remainder
###############################################################################
.Lseal_avx2_320:
vmovdqa $A0, $A1
vmovdqa $A0, $A2
vmovdqa $B0, $B1
vmovdqa $B0, $B2
vmovdqa $C0, $C1
vmovdqa $C0, $C2
vpaddd .Lavx2_inc(%rip), $D0, $D1
vpaddd .Lavx2_inc(%rip), $D1, $D2
vmovdqa $B0, $T1
vmovdqa $C0, $T2
vmovdqa $D0, $ctr0_store
vmovdqa $D1, $ctr1_store
vmovdqa $D2, $ctr2_store
mov \$10, $acc0
.Lseal_avx2_320_rounds: \n";
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
&chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
&chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
dec $acc0
jne .Lseal_avx2_320_rounds
vpaddd .Lchacha20_consts(%rip), $A0, $A0
vpaddd .Lchacha20_consts(%rip), $A1, $A1
vpaddd .Lchacha20_consts(%rip), $A2, $A2
vpaddd $T1, $B0, $B0
vpaddd $T1, $B1, $B1
vpaddd $T1, $B2, $B2
vpaddd $T2, $C0, $C0
vpaddd $T2, $C1, $C1
vpaddd $T2, $C2, $C2
vpaddd $ctr0_store, $D0, $D0
vpaddd $ctr1_store, $D1, $D1
vpaddd $ctr2_store, $D2, $D2
vperm2i128 \$0x02, $A0, $B0, $T0
# Clamp and store the key
vpand .Lclamp(%rip), $T0, $T0
vmovdqa $T0, $r_store
# Stream for up to 320 bytes
vperm2i128 \$0x13, $A0, $B0, $A0
vperm2i128 \$0x13, $C0, $D0, $B0
vperm2i128 \$0x02, $A1, $B1, $C0
vperm2i128 \$0x02, $C1, $D1, $D0
vperm2i128 \$0x13, $A1, $B1, $A1
vperm2i128 \$0x13, $C1, $D1, $B1
vperm2i128 \$0x02, $A2, $B2, $C1
vperm2i128 \$0x02, $C2, $D2, $D1
vperm2i128 \$0x13, $A2, $B2, $A2
vperm2i128 \$0x13, $C2, $D2, $B2
jmp .Lseal_avx2_short
###############################################################################
.Lseal_avx2_192:
vmovdqa $A0, $A1
vmovdqa $A0, $A2
vmovdqa $B0, $B1
vmovdqa $B0, $B2
vmovdqa $C0, $C1
vmovdqa $C0, $C2
vpaddd .Lavx2_inc(%rip), $D0, $D1
vmovdqa $D0, $T2
vmovdqa $D1, $T3
mov \$10, $acc0
.Lseal_avx2_192_rounds: \n";
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.="
dec $acc0
jne .Lseal_avx2_192_rounds
vpaddd $A2, $A0, $A0
vpaddd $A2, $A1, $A1
vpaddd $B2, $B0, $B0
vpaddd $B2, $B1, $B1
vpaddd $C2, $C0, $C0
vpaddd $C2, $C1, $C1
vpaddd $T2, $D0, $D0
vpaddd $T3, $D1, $D1
vperm2i128 \$0x02, $A0, $B0, $T0
# Clamp and store the key
vpand .Lclamp(%rip), $T0, $T0
vmovdqa $T0, $r_store
# Stream for up to 192 bytes
vperm2i128 \$0x13, $A0, $B0, $A0
vperm2i128 \$0x13, $C0, $D0, $B0
vperm2i128 \$0x02, $A1, $B1, $C0
vperm2i128 \$0x02, $C1, $D1, $D0
vperm2i128 \$0x13, $A1, $B1, $A1
vperm2i128 \$0x13, $C1, $D1, $B1
.Lseal_avx2_short:
mov $adl, $itr2
call poly_hash_ad_internal
xor $itr1, $itr1
.Lseal_avx2_short_hash_remainder:
cmp \$16, $itr1
jb .Lseal_avx2_short_loop\n";
&poly_add("0($oup)");
&poly_mul(); $code.="
sub \$16, $itr1
add \$16, $oup
jmp .Lseal_avx2_short_hash_remainder
.Lseal_avx2_short_loop:
cmp \$32, $inl
jb .Lseal_avx2_short_tail
sub \$32, $inl
# Encrypt
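# (sealing hashes the ciphertext, so the Poly1305 adds below read back from the
# just-written output)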
vpxor ($inp), $A0, $A0
vmovdqu $A0, ($oup)
lea 1*32($inp), $inp
# Load + hash\n";
&poly_add("0*8($oup)");
&poly_mul();
&poly_add("2*8($oup)");
&poly_mul(); $code.="
lea 1*32($oup), $oup
# Shift stream
vmovdqa $B0, $A0
vmovdqa $C0, $B0
vmovdqa $D0, $C0
vmovdqa $A1, $D0
vmovdqa $B1, $A1
vmovdqa $C1, $B1
vmovdqa $D1, $C1
vmovdqa $A2, $D1
vmovdqa $B2, $A2
jmp .Lseal_avx2_short_loop
.Lseal_avx2_short_tail:
cmp \$16, $inl
jb .Lseal_avx2_exit
sub \$16, $inl
vpxor ($inp), $A0x, $A3x
vmovdqu $A3x, ($oup)
lea 1*16($inp), $inp\n";
&poly_add("0*8($oup)");
&poly_mul(); $code.="
lea 1*16($oup), $oup
vextracti128 \$1, $A0, $A0x
.Lseal_avx2_exit:
vzeroupper
jmp .Lseal_sse_tail_16
.cfi_endproc
.size chacha20_poly1305_seal_avx2, .-chacha20_poly1305_seal_avx2
";
}

$code =~ s/\`([^\`]*)\`/eval $1/gem;

print $code;

close STDOUT or die "error closing STDOUT: $!";
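
# Usage sketch (an assumption, not taken from this file: the usual perlasm
# invocation passes a flavour and an output path; the flavour/output names
# below are illustrative only):
#   perl chacha20_poly1305_x86_64.pl elf  chacha20_poly1305_x86_64.S
#   perl chacha20_poly1305_x86_64.pl nasm chacha20_poly1305_x86_64.asm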