#!/usr/bin/env perl

# Copyright (c) 2020, CloudFlare Ltd.
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

##############################################################################
#                                                                            #
# Author: Vlad Krasnov                                                       #
#                                                                            #
##############################################################################

$flavour = shift;
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

my ($oup,$inp,$inl,$adp,$adl,$keyp,$itr1,$itr2) = ("x0","x1","x2","x3","x4","x5","x6","x7");
my ($acc0,$acc1,$acc2) = map("x$_",(8..10));
my ($t0,$t1,$t2,$t3) = map("x$_",(11..14));
my ($one, $r0, $r1) = ("x15","x16","x17");
my ($t0w) = $t0 =~ s/x/w/r;

my ($A0,$A1,$A2,$A3,$A4,$B0,$B1,$B2,$B3,$B4,$C0,$C1,$C2,$C3,$C4,$D0,$D1,$D2,$D3,$D4) = map("v$_",(0..19));
my ($T0,$T1,$T2,$T3) = map("v$_",(20..23));

my $CONSTS = "v24";
my $INC = "v25";
my $ROL8 = "v26";
my $CLAMP = "v27";

my ($B_STORE, $C_STORE, $D_STORE) = map("v$_",(28..30));

my $S_STORE = $CLAMP;
my $LEN_STORE = "v31";

sub chacha_qr {
    my ($a,$b,$c,$d,$t,$dir)=@_;
    my ($shift_b,$shift_d) = $dir =~ /left/ ? ("#4","#12") : ("#12","#4");
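    # For reference only (nothing below is emitted): one ChaCha20 quarter
    # round, applied here to four 32-bit lanes at a time, is
    #
    #     a += b; d ^= a; d = rol32(d, 16);
    #     c += d; b ^= c; b = rol32(b, 12);
    #     a += b; d ^= a; d = rol32(d,  8);
    #     c += d; b ^= c; b = rol32(b,  7);
    #
    # The rotations map onto NEON as: rol 16 -> rev32 on 16-bit halves,
    # rol 8 -> tbl with the .Lrol8 permutation, and rol 12 / rol 7 -> a
    # ushr/sli pair, rol(x,n) = (x << n) | (x >> (32-n)), written into a
    # spare register (hence the ($t,$b) swaps below). The trailing ext
    # instructions rotate the b/c/d rows by one, two and three words,
    # switching the state between column-round and diagonal-round layouts.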
("#4","#12") : ("#12","#4"); 56 $code.=<<___; 57 add $a.4s, $a.4s, $b.4s 58 eor $d.16b, $d.16b, $a.16b 59 rev32 $d.8h, $d.8h 60 61 add $c.4s, $c.4s, $d.4s 62 eor $b.16b, $b.16b, $c.16b 63 ushr $t.4s, $b.4s, #20 64 sli $t.4s, $b.4s, #12 65 ___ 66 ($t,$b) = ($b,$t); 67 $code.=<<___; 68 add $a.4s, $a.4s, $b.4s 69 eor $d.16b, $d.16b, $a.16b 70 tbl $d.16b, {$d.16b}, $ROL8.16b 71 72 add $c.4s, $c.4s, $d.4s 73 eor $b.16b, $b.16b, $c.16b 74 ushr $t.4s, $b.4s, #25 75 sli $t.4s, $b.4s, #7 76 ___ 77 ($t,$b) = ($b,$t); 78 $code.=<<___; 79 ext $b.16b, $b.16b, $b.16b, $shift_b 80 ext $c.16b, $c.16b, $c.16b, #8 81 ext $d.16b, $d.16b, $d.16b, $shift_d 82 ___ 83 } 84 85 sub poly_add { 86 my ($src)=@_; 87 $code.="ldp $t0, $t1, [$src], 16 88 adds $acc0, $acc0, $t0 89 adcs $acc1, $acc1, $t1 90 adc $acc2, $acc2, $one\n"; 91 } 92 93 sub poly_add_vec { 94 my ($src)=@_; 95 $code.="mov $t0, $src.d[0] 96 mov $t1, $src.d[1] 97 adds $acc0, $acc0, $t0 98 adcs $acc1, $acc1, $t1 99 adc $acc2, $acc2, $one\n"; 100 } 101 102 sub poly_stage1 { 103 $code.="mul $t0, $acc0, $r0 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 104 umulh $t1, $acc0, $r0 105 mul $t2, $acc1, $r0 106 umulh $t3, $acc1, $r0 107 adds $t1, $t1, $t2 108 mul $t2, $acc2, $r0 109 adc $t2, $t2, $t3\n"; 110 } 111 112 sub poly_stage2 { 113 $code.="mul $t3, $acc0, $r1 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 114 umulh $acc0, $acc0, $r1 115 adds $t1, $t1, $t3 116 mul $t3, $acc1, $r1 117 umulh $acc1, $acc1, $r1 118 adcs $t3, $t3, $acc0 119 mul $acc2, $acc2, $r1 120 adc $acc2, $acc2, $acc1 121 adds $t2, $t2, $t3 122 adc $t3, $acc2, xzr\n"; 123 } 124 125 # At the beginning of the reduce stage t = [t3:t2:t1:t0] is a product of 126 # r = [r1:r0] and acc = [acc2:acc1:acc0] 127 # r is 124 bits at most (due to clamping) and acc is 131 bits at most 128 # (acc2 is at most 4 before the addition and can be at most 6 when we add in 129 # the next block) therefore t is at most 255 bits big, and t3 is 63 bits. 130 sub poly_reduce_stage { 131 $code.="and $acc2, $t2, #3 // At this point acc2 is 2 bits at most (value of 3) 132 and $acc0, $t2, #-4 133 extr $t2, $t3, $t2, #2 134 adds $acc0, $acc0, $t0 135 lsr $t0, $t3, #2 136 adc $acc1, $t3, $t0 // No carry out since t0 is 61 bits and t3 is 63 bits 137 adds $acc0, $acc0, $t2 138 adcs $acc1, $acc1, $t1 139 adc $acc2, $acc2, xzr // At this point acc2 has the value of 4 at most \n"; 140 } 141 142 sub poly_mul { 143 &poly_stage1(); 144 &poly_stage2(); 145 &poly_reduce_stage(); 146 } 147 148 sub chacha_qr_x3 { 149 my ($dir)=@_; 150 my ($shift_b,$shift_d) = $dir =~ /left/ ? 
("#4","#12") : ("#12","#4"); 151 $code.=<<___; 152 add $A0.4s, $A0.4s, $B0.4s 153 add $A1.4s, $A1.4s, $B1.4s 154 add $A2.4s, $A2.4s, $B2.4s 155 eor $D0.16b, $D0.16b, $A0.16b 156 eor $D1.16b, $D1.16b, $A1.16b 157 eor $D2.16b, $D2.16b, $A2.16b 158 rev32 $D0.8h, $D0.8h 159 rev32 $D1.8h, $D1.8h 160 rev32 $D2.8h, $D2.8h 161 162 add $C0.4s, $C0.4s, $D0.4s 163 add $C1.4s, $C1.4s, $D1.4s 164 add $C2.4s, $C2.4s, $D2.4s 165 eor $B0.16b, $B0.16b, $C0.16b 166 eor $B1.16b, $B1.16b, $C1.16b 167 eor $B2.16b, $B2.16b, $C2.16b 168 ushr $T0.4s, $B0.4s, #20 169 sli $T0.4s, $B0.4s, #12 170 ushr $B0.4s, $B1.4s, #20 171 sli $B0.4s, $B1.4s, #12 172 ushr $B1.4s, $B2.4s, #20 173 sli $B1.4s, $B2.4s, #12 174 175 add $A0.4s, $A0.4s, $T0.4s 176 add $A1.4s, $A1.4s, $B0.4s 177 add $A2.4s, $A2.4s, $B1.4s 178 eor $D0.16b, $D0.16b, $A0.16b 179 eor $D1.16b, $D1.16b, $A1.16b 180 eor $D2.16b, $D2.16b, $A2.16b 181 tbl $D0.16b, {$D0.16b}, $ROL8.16b 182 tbl $D1.16b, {$D1.16b}, $ROL8.16b 183 tbl $D2.16b, {$D2.16b}, $ROL8.16b 184 185 add $C0.4s, $C0.4s, $D0.4s 186 add $C1.4s, $C1.4s, $D1.4s 187 add $C2.4s, $C2.4s, $D2.4s 188 eor $T0.16b, $T0.16b, $C0.16b 189 eor $B0.16b, $B0.16b, $C1.16b 190 eor $B1.16b, $B1.16b, $C2.16b 191 ushr $B2.4s, $B1.4s, #25 192 sli $B2.4s, $B1.4s, #7 193 ushr $B1.4s, $B0.4s, #25 194 sli $B1.4s, $B0.4s, #7 195 ushr $B0.4s, $T0.4s, #25 196 sli $B0.4s, $T0.4s, #7 197 198 ext $B0.16b, $B0.16b, $B0.16b, $shift_b 199 ext $B1.16b, $B1.16b, $B1.16b, $shift_b 200 ext $B2.16b, $B2.16b, $B2.16b, $shift_b 201 202 ext $C0.16b, $C0.16b, $C0.16b, #8 203 ext $C1.16b, $C1.16b, $C1.16b, #8 204 ext $C2.16b, $C2.16b, $C2.16b, #8 205 206 ext $D0.16b, $D0.16b, $D0.16b, $shift_d 207 ext $D1.16b, $D1.16b, $D1.16b, $shift_d 208 ext $D2.16b, $D2.16b, $D2.16b, $shift_d 209 ___ 210 } 211 212 # When preparing 5 ChaCha20 blocks in parallel, we operate on 4 blocks vertically as introduced by Andrew Moon 213 # the fifth block is done horizontally 214 sub chacha_qr_x5 { 215 my ($dir)=@_; 216 my ($a0,$a1,$a2,$a3) = $dir =~ /left/ ? ($A0,$A1,$A2,$A3) : ($A0,$A1,$A2,$A3); 217 my ($b0,$b1,$b2,$b3) = $dir =~ /left/ ? ($B0,$B1,$B2,$B3) : ($B1,$B2,$B3,$B0); 218 my ($c0,$c1,$c2,$c3) = $dir =~ /left/ ? ($C0,$C1,$C2,$C3) : ($C2,$C3,$C0,$C1); 219 my ($d0,$d1,$d2,$d3) = $dir =~ /left/ ? ($D0,$D1,$D2,$D3) : ($D3,$D0,$D1,$D2); 220 my ($shift_b,$shift_d) = $dir =~ /left/ ? 
("#4","#12") : ("#12","#4"); 221 $code.=<<___; 222 add $a0.4s, $a0.4s, $b0.4s 223 add $a1.4s, $a1.4s, $b1.4s 224 add $a2.4s, $a2.4s, $b2.4s 225 add $a3.4s, $a3.4s, $b3.4s 226 add $A4.4s, $A4.4s, $B4.4s 227 228 eor $d0.16b, $d0.16b, $a0.16b 229 eor $d1.16b, $d1.16b, $a1.16b 230 eor $d2.16b, $d2.16b, $a2.16b 231 eor $d3.16b, $d3.16b, $a3.16b 232 eor $D4.16b, $D4.16b, $A4.16b 233 234 rev32 $d0.8h, $d0.8h 235 rev32 $d1.8h, $d1.8h 236 rev32 $d2.8h, $d2.8h 237 rev32 $d3.8h, $d3.8h 238 rev32 $D4.8h, $D4.8h 239 240 add $c0.4s, $c0.4s, $d0.4s 241 add $c1.4s, $c1.4s, $d1.4s 242 add $c2.4s, $c2.4s, $d2.4s 243 add $c3.4s, $c3.4s, $d3.4s 244 add $C4.4s, $C4.4s, $D4.4s 245 246 eor $b0.16b, $b0.16b, $c0.16b 247 eor $b1.16b, $b1.16b, $c1.16b 248 eor $b2.16b, $b2.16b, $c2.16b 249 eor $b3.16b, $b3.16b, $c3.16b 250 eor $B4.16b, $B4.16b, $C4.16b 251 252 ushr $T0.4s, $b0.4s, #20 253 sli $T0.4s, $b0.4s, #12 254 ushr $b0.4s, $b1.4s, #20 255 sli $b0.4s, $b1.4s, #12 256 ushr $b1.4s, $b2.4s, #20 257 sli $b1.4s, $b2.4s, #12 258 ushr $b2.4s, $b3.4s, #20 259 sli $b2.4s, $b3.4s, #12 260 ushr $b3.4s, $B4.4s, #20 261 sli $b3.4s, $B4.4s, #12 262 263 add $a0.4s, $a0.4s, $T0.4s 264 add $a1.4s, $a1.4s, $b0.4s 265 add $a2.4s, $a2.4s, $b1.4s 266 add $a3.4s, $a3.4s, $b2.4s 267 add $A4.4s, $A4.4s, $b3.4s 268 269 eor $d0.16b, $d0.16b, $a0.16b 270 eor $d1.16b, $d1.16b, $a1.16b 271 eor $d2.16b, $d2.16b, $a2.16b 272 eor $d3.16b, $d3.16b, $a3.16b 273 eor $D4.16b, $D4.16b, $A4.16b 274 275 tbl $d0.16b, {$d0.16b}, $ROL8.16b 276 tbl $d1.16b, {$d1.16b}, $ROL8.16b 277 tbl $d2.16b, {$d2.16b}, $ROL8.16b 278 tbl $d3.16b, {$d3.16b}, $ROL8.16b 279 tbl $D4.16b, {$D4.16b}, $ROL8.16b 280 281 add $c0.4s, $c0.4s, $d0.4s 282 add $c1.4s, $c1.4s, $d1.4s 283 add $c2.4s, $c2.4s, $d2.4s 284 add $c3.4s, $c3.4s, $d3.4s 285 add $C4.4s, $C4.4s, $D4.4s 286 287 eor $T0.16b, $T0.16b, $c0.16b 288 eor $b0.16b, $b0.16b, $c1.16b 289 eor $b1.16b, $b1.16b, $c2.16b 290 eor $b2.16b, $b2.16b, $c3.16b 291 eor $b3.16b, $b3.16b, $C4.16b 292 293 ushr $B4.4s, $b3.4s, #25 294 sli $B4.4s, $b3.4s, #7 295 ushr $b3.4s, $b2.4s, #25 296 sli $b3.4s, $b2.4s, #7 297 ushr $b2.4s, $b1.4s, #25 298 sli $b2.4s, $b1.4s, #7 299 ushr $b1.4s, $b0.4s, #25 300 sli $b1.4s, $b0.4s, #7 301 ushr $b0.4s, $T0.4s, #25 302 sli $b0.4s, $T0.4s, #7 303 304 ext $B4.16b, $B4.16b, $B4.16b, $shift_b 305 ext $C4.16b, $C4.16b, $C4.16b, #8 306 ext $D4.16b, $D4.16b, $D4.16b, $shift_d 307 ___ 308 } 309 310 { 311 $code.=<<___; 312 #include <ring-core/arm_arch.h> 313 .section .rodata 314 315 .align 7 316 .Lchacha20_consts: 317 .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' 318 .Linc: 319 .long 1,2,3,4 320 .Lrol8: 321 .byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 322 .Lclamp: 323 .quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC 324 325 .text 326 327 .type .Lpoly_hash_ad_internal,%function 328 .align 6 329 .Lpoly_hash_ad_internal: 330 .cfi_startproc 331 cbnz $adl, .Lpoly_hash_intro 332 ret 333 334 .Lpoly_hash_intro: 335 cmp $adl, #16 336 b.lt .Lpoly_hash_ad_tail 337 ___ 338 &poly_add($adp); 339 &poly_mul(); 340 $code.=<<___; 341 sub $adl, $adl, #16 342 b .Lpoly_hash_ad_internal 343 344 .Lpoly_hash_ad_tail: 345 cbz $adl, .Lpoly_hash_ad_ret 346 347 eor $T0.16b, $T0.16b, $T0.16b // Use T0 to load the AAD 348 sub $adl, $adl, #1 349 350 .Lpoly_hash_tail_16_compose: 351 ext $T0.16b, $T0.16b, $T0.16b, #15 352 ldrb $t0w, [$adp, $adl] 353 mov $T0.b[0], $t0w 354 subs $adl, $adl, #1 355 b.ge .Lpoly_hash_tail_16_compose 356 ___ 357 &poly_add_vec($T0); 358 &poly_mul(); 359 $code.=<<___; 360 361 
$code.=<<___;

.Lpoly_hash_ad_ret:
    ret
.cfi_endproc
.size .Lpoly_hash_ad_internal, .-.Lpoly_hash_ad_internal

/////////////////////////////////
//
// void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *seal_data);
//
.globl chacha20_poly1305_seal
.type chacha20_poly1305_seal,%function
.align 6
chacha20_poly1305_seal:
    AARCH64_SIGN_LINK_REGISTER
.cfi_startproc
    stp x29, x30, [sp, #-80]!
.cfi_def_cfa_offset 80
.cfi_offset w30, -72
.cfi_offset w29, -80
    mov x29, sp
    // We probably could do .cfi_def_cfa w29, 80 at this point, but since
    // we don't actually use the frame pointer like that, it's probably not
    // worth bothering.
    stp d8, d9, [sp, #16]
    stp d10, d11, [sp, #32]
    stp d12, d13, [sp, #48]
    stp d14, d15, [sp, #64]
.cfi_offset b15, -8
.cfi_offset b14, -16
.cfi_offset b13, -24
.cfi_offset b12, -32
.cfi_offset b11, -40
.cfi_offset b10, -48
.cfi_offset b9, -56
.cfi_offset b8, -64

    adrp $t0, :pg_hi21:.Lchacha20_consts
    add $t0, $t0, :lo12:.Lchacha20_consts

    ld1 {$CONSTS.16b - $CLAMP.16b}, [$t0] // Load the CONSTS, INC, ROL8 and CLAMP values
    ld1 {$B_STORE.16b - $D_STORE.16b}, [$keyp]

    mov $one, #1 // Prepare the Poly1305 state
    mov $acc0, #0
    mov $acc1, #0
    mov $acc2, #0

    ldr $t1, [$keyp, #56] // The total cipher text length includes extra_in_len
    add $t1, $t1, $inl
    mov $LEN_STORE.d[0], $adl // Store the input and aad lengths
    mov $LEN_STORE.d[1], $t1

    cmp $inl, #128
    b.le .Lseal_128 // Optimization for smaller buffers

    // Initially we prepare 5 ChaCha20 blocks. Four to encrypt up to 4 blocks (256 bytes) of plaintext,
    // and one for the Poly1305 R and S keys. The first four blocks (A0-A3..D0-D3) are computed vertically,
    // the fifth block (A4-D4) horizontally.
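    // The ld4r loads below replicate each 32-bit state word into its own
    // vector: A0-A3/B0-B3/C0-C3/D0-D3 hold four blocks in transposed form,
    // one word position per register and one block per lane. Adding
    // $INC = {1,2,3,4} to $D0 gives those four blocks counters +1..+4,
    // while the horizontal block A4-D4 keeps the caller's initial counter
    // and is only used to derive the Poly1305 R and S keys.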
    ld4r {$A0.4s-$A3.4s}, [$t0]
    mov $A4.16b, $CONSTS.16b

    ld4r {$B0.4s-$B3.4s}, [$keyp], #16
    mov $B4.16b, $B_STORE.16b

    ld4r {$C0.4s-$C3.4s}, [$keyp], #16
    mov $C4.16b, $C_STORE.16b

    ld4r {$D0.4s-$D3.4s}, [$keyp]
    add $D0.4s, $D0.4s, $INC.4s
    mov $D4.16b, $D_STORE.16b

    sub $keyp, $keyp, #32

    mov $itr1, #10

.align 5
.Lseal_init_rounds:
___
&chacha_qr_x5("left");
&chacha_qr_x5("right");
$code.=<<___;
    subs $itr1, $itr1, #1
    b.hi .Lseal_init_rounds

    add $D0.4s, $D0.4s, $INC.4s
    mov $t0, #4
    dup $T0.4s, $t0w
    add $INC.4s, $INC.4s, $T0.4s

    zip1 $T0.4s, $A0.4s, $A1.4s
    zip2 $T1.4s, $A0.4s, $A1.4s
    zip1 $T2.4s, $A2.4s, $A3.4s
    zip2 $T3.4s, $A2.4s, $A3.4s

    zip1 $A0.2d, $T0.2d, $T2.2d
    zip2 $A1.2d, $T0.2d, $T2.2d
    zip1 $A2.2d, $T1.2d, $T3.2d
    zip2 $A3.2d, $T1.2d, $T3.2d

    zip1 $T0.4s, $B0.4s, $B1.4s
    zip2 $T1.4s, $B0.4s, $B1.4s
    zip1 $T2.4s, $B2.4s, $B3.4s
    zip2 $T3.4s, $B2.4s, $B3.4s

    zip1 $B0.2d, $T0.2d, $T2.2d
    zip2 $B1.2d, $T0.2d, $T2.2d
    zip1 $B2.2d, $T1.2d, $T3.2d
    zip2 $B3.2d, $T1.2d, $T3.2d

    zip1 $T0.4s, $C0.4s, $C1.4s
    zip2 $T1.4s, $C0.4s, $C1.4s
    zip1 $T2.4s, $C2.4s, $C3.4s
    zip2 $T3.4s, $C2.4s, $C3.4s

    zip1 $C0.2d, $T0.2d, $T2.2d
    zip2 $C1.2d, $T0.2d, $T2.2d
    zip1 $C2.2d, $T1.2d, $T3.2d
    zip2 $C3.2d, $T1.2d, $T3.2d

    zip1 $T0.4s, $D0.4s, $D1.4s
    zip2 $T1.4s, $D0.4s, $D1.4s
    zip1 $T2.4s, $D2.4s, $D3.4s
    zip2 $T3.4s, $D2.4s, $D3.4s

    zip1 $D0.2d, $T0.2d, $T2.2d
    zip2 $D1.2d, $T0.2d, $T2.2d
    zip1 $D2.2d, $T1.2d, $T3.2d
    zip2 $D3.2d, $T1.2d, $T3.2d

    add $A4.4s, $A4.4s, $CONSTS.4s
    add $B4.4s, $B4.4s, $B_STORE.4s
    and $A4.16b, $A4.16b, $CLAMP.16b

    add $A0.4s, $A0.4s, $CONSTS.4s
    add $B0.4s, $B0.4s, $B_STORE.4s
    add $C0.4s, $C0.4s, $C_STORE.4s
    add $D0.4s, $D0.4s, $D_STORE.4s

    add $A1.4s, $A1.4s, $CONSTS.4s
    add $B1.4s, $B1.4s, $B_STORE.4s
    add $C1.4s, $C1.4s, $C_STORE.4s
    add $D1.4s, $D1.4s, $D_STORE.4s

    add $A2.4s, $A2.4s, $CONSTS.4s
    add $B2.4s, $B2.4s, $B_STORE.4s
    add $C2.4s, $C2.4s, $C_STORE.4s
    add $D2.4s, $D2.4s, $D_STORE.4s

    add $A3.4s, $A3.4s, $CONSTS.4s
    add $B3.4s, $B3.4s, $B_STORE.4s
    add $C3.4s, $C3.4s, $C_STORE.4s
    add $D3.4s, $D3.4s, $D_STORE.4s

    mov $r0, $A4.d[0] // Move the R key to GPRs
    mov $r1, $A4.d[1]
    mov $S_STORE.16b, $B4.16b // Store the S key

    bl .Lpoly_hash_ad_internal

    mov $adp, $oup
    cmp $inl, #256
    b.le .Lseal_tail

    ld1 {$T0.16b - $T3.16b}, [$inp], #64
    eor $T0.16b, $T0.16b, $A0.16b
    eor $T1.16b, $T1.16b, $B0.16b
    eor $T2.16b, $T2.16b, $C0.16b
    eor $T3.16b, $T3.16b, $D0.16b
    st1 {$T0.16b - $T3.16b}, [$oup], #64

    ld1 {$T0.16b - $T3.16b}, [$inp], #64
    eor $T0.16b, $T0.16b, $A1.16b
    eor $T1.16b, $T1.16b, $B1.16b
    eor $T2.16b, $T2.16b, $C1.16b
    eor $T3.16b, $T3.16b, $D1.16b
    st1 {$T0.16b - $T3.16b}, [$oup], #64

    ld1 {$T0.16b - $T3.16b}, [$inp], #64
    eor $T0.16b, $T0.16b, $A2.16b
    eor $T1.16b, $T1.16b, $B2.16b
    eor $T2.16b, $T2.16b, $C2.16b
    eor $T3.16b, $T3.16b, $D2.16b
    st1 {$T0.16b - $T3.16b}, [$oup], #64

    ld1 {$T0.16b - $T3.16b}, [$inp], #64
    eor $T0.16b, $T0.16b, $A3.16b
    eor $T1.16b, $T1.16b, $B3.16b
    eor $T2.16b, $T2.16b, $C3.16b
    eor $T3.16b, $T3.16b, $D3.16b
    st1 {$T0.16b - $T3.16b}, [$oup], #64

    sub $inl, $inl, #256

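    // From here on $adp trails $oup: the main loop hashes previously written
    // ciphertext (the 256 bytes just stored on the first pass, 320 bytes per
    // pass afterwards) while the next batch of keystream is being computed.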
    mov $itr1, #4 // In the first run of the loop we need to hash 256 bytes, therefore we hash one block for the first 4 rounds
    mov $itr2, #6 // and two blocks for the remaining 6, for a total of (1 * 4 + 2 * 6) * 16 = 256

.Lseal_main_loop:
    adrp $t0, :pg_hi21:.Lchacha20_consts
    add $t0, $t0, :lo12:.Lchacha20_consts

    ld4r {$A0.4s-$A3.4s}, [$t0]
    mov $A4.16b, $CONSTS.16b

    ld4r {$B0.4s-$B3.4s}, [$keyp], #16
    mov $B4.16b, $B_STORE.16b

    ld4r {$C0.4s-$C3.4s}, [$keyp], #16
    mov $C4.16b, $C_STORE.16b

    ld4r {$D0.4s-$D3.4s}, [$keyp]
    add $D0.4s, $D0.4s, $INC.4s
    mov $D4.16b, $D_STORE.16b

    eor $T0.16b, $T0.16b, $T0.16b //zero
    not $T1.16b, $T0.16b // -1
    sub $T1.4s, $INC.4s, $T1.4s // Add +1
    ext $T0.16b, $T1.16b, $T0.16b, #12 // Get the last element (counter)
    add $D4.4s, $D4.4s, $T0.4s

    sub $keyp, $keyp, #32
.align 5
.Lseal_main_loop_rounds:
___
&chacha_qr_x5("left");
&poly_add($adp);
&poly_mul();
&chacha_qr_x5("right");
$code.=<<___;
    subs $itr1, $itr1, #1
    b.ge .Lseal_main_loop_rounds
___
&poly_add($adp);
&poly_mul();
$code.=<<___;
    subs $itr2, $itr2, #1
    b.gt .Lseal_main_loop_rounds

    eor $T0.16b, $T0.16b, $T0.16b //zero
    not $T1.16b, $T0.16b // -1
    sub $T1.4s, $INC.4s, $T1.4s // Add +1
    ext $T0.16b, $T1.16b, $T0.16b, #12 // Get the last element (counter)
    add $D4.4s, $D4.4s, $T0.4s

    add $D0.4s, $D0.4s, $INC.4s
    mov $t0, #5
    dup $T0.4s, $t0w
    add $INC.4s, $INC.4s, $T0.4s

    zip1 $T0.4s, $A0.4s, $A1.4s
    zip2 $T1.4s, $A0.4s, $A1.4s
    zip1 $T2.4s, $A2.4s, $A3.4s
    zip2 $T3.4s, $A2.4s, $A3.4s

    zip1 $A0.2d, $T0.2d, $T2.2d
    zip2 $A1.2d, $T0.2d, $T2.2d
    zip1 $A2.2d, $T1.2d, $T3.2d
    zip2 $A3.2d, $T1.2d, $T3.2d

    zip1 $T0.4s, $B0.4s, $B1.4s
    zip2 $T1.4s, $B0.4s, $B1.4s
    zip1 $T2.4s, $B2.4s, $B3.4s
    zip2 $T3.4s, $B2.4s, $B3.4s

    zip1 $B0.2d, $T0.2d, $T2.2d
    zip2 $B1.2d, $T0.2d, $T2.2d
    zip1 $B2.2d, $T1.2d, $T3.2d
    zip2 $B3.2d, $T1.2d, $T3.2d

    zip1 $T0.4s, $C0.4s, $C1.4s
    zip2 $T1.4s, $C0.4s, $C1.4s
    zip1 $T2.4s, $C2.4s, $C3.4s
    zip2 $T3.4s, $C2.4s, $C3.4s

    zip1 $C0.2d, $T0.2d, $T2.2d
    zip2 $C1.2d, $T0.2d, $T2.2d
    zip1 $C2.2d, $T1.2d, $T3.2d
    zip2 $C3.2d, $T1.2d, $T3.2d

    zip1 $T0.4s, $D0.4s, $D1.4s
    zip2 $T1.4s, $D0.4s, $D1.4s
    zip1 $T2.4s, $D2.4s, $D3.4s
    zip2 $T3.4s, $D2.4s, $D3.4s

    zip1 $D0.2d, $T0.2d, $T2.2d
    zip2 $D1.2d, $T0.2d, $T2.2d
    zip1 $D2.2d, $T1.2d, $T3.2d
    zip2 $D3.2d, $T1.2d, $T3.2d

    add $A0.4s, $A0.4s, $CONSTS.4s
    add $B0.4s, $B0.4s, $B_STORE.4s
    add $C0.4s, $C0.4s, $C_STORE.4s
    add $D0.4s, $D0.4s, $D_STORE.4s

    add $A1.4s, $A1.4s, $CONSTS.4s
    add $B1.4s, $B1.4s, $B_STORE.4s
    add $C1.4s, $C1.4s, $C_STORE.4s
    add $D1.4s, $D1.4s, $D_STORE.4s

    add $A2.4s, $A2.4s, $CONSTS.4s
    add $B2.4s, $B2.4s, $B_STORE.4s
    add $C2.4s, $C2.4s, $C_STORE.4s
    add $D2.4s, $D2.4s, $D_STORE.4s

    add $A3.4s, $A3.4s, $CONSTS.4s
    add $B3.4s, $B3.4s, $B_STORE.4s
    add $C3.4s, $C3.4s, $C_STORE.4s
    add $D3.4s, $D3.4s, $D_STORE.4s

    add $A4.4s, $A4.4s, $CONSTS.4s
    add $B4.4s, $B4.4s, $B_STORE.4s
    add $C4.4s, $C4.4s, $C_STORE.4s
    add $D4.4s, $D4.4s, $D_STORE.4s

    cmp $inl, #320
    b.le .Lseal_tail

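    // More than 320 bytes remain at this point, so all five 64-byte groups
    // of keystream (A0-D0 ... A4-D4) can be XORed and stored in full.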
    ld1 {$T0.16b - $T3.16b}, [$inp], #64
    eor $T0.16b, $T0.16b, $A0.16b
    eor $T1.16b, $T1.16b, $B0.16b
    eor $T2.16b, $T2.16b, $C0.16b
    eor $T3.16b, $T3.16b, $D0.16b
    st1 {$T0.16b - $T3.16b}, [$oup], #64

    ld1 {$T0.16b - $T3.16b}, [$inp], #64
    eor $T0.16b, $T0.16b, $A1.16b
    eor $T1.16b, $T1.16b, $B1.16b
    eor $T2.16b, $T2.16b, $C1.16b
    eor $T3.16b, $T3.16b, $D1.16b
    st1 {$T0.16b - $T3.16b}, [$oup], #64

    ld1 {$T0.16b - $T3.16b}, [$inp], #64
    eor $T0.16b, $T0.16b, $A2.16b
    eor $T1.16b, $T1.16b, $B2.16b
    eor $T2.16b, $T2.16b, $C2.16b
    eor $T3.16b, $T3.16b, $D2.16b
    st1 {$T0.16b - $T3.16b}, [$oup], #64

    ld1 {$T0.16b - $T3.16b}, [$inp], #64
    eor $T0.16b, $T0.16b, $A3.16b
    eor $T1.16b, $T1.16b, $B3.16b
    eor $T2.16b, $T2.16b, $C3.16b
    eor $T3.16b, $T3.16b, $D3.16b
    st1 {$T0.16b - $T3.16b}, [$oup], #64

    ld1 {$T0.16b - $T3.16b}, [$inp], #64
    eor $T0.16b, $T0.16b, $A4.16b
    eor $T1.16b, $T1.16b, $B4.16b
    eor $T2.16b, $T2.16b, $C4.16b
    eor $T3.16b, $T3.16b, $D4.16b
    st1 {$T0.16b - $T3.16b}, [$oup], #64

    sub $inl, $inl, #320

    mov $itr1, #0
    mov $itr2, #10 // For the remainder of the loop we always hash and encrypt 320 bytes per iteration

    b .Lseal_main_loop

.Lseal_tail:
    // This part of the function handles the storage and authentication of the last [0,320) bytes
    // We assume A0-A4 ... D0-D4 hold at least inl (320 max) bytes of the stream data.
    cmp $inl, #64
    b.lt .Lseal_tail_64

    // Store and authenticate 64B blocks per iteration
    ld1 {$T0.16b - $T3.16b}, [$inp], #64

    eor $T0.16b, $T0.16b, $A0.16b
    eor $T1.16b, $T1.16b, $B0.16b
    eor $T2.16b, $T2.16b, $C0.16b
    eor $T3.16b, $T3.16b, $D0.16b
___
&poly_add_vec($T0);
&poly_mul();
&poly_add_vec($T1);
&poly_mul();
&poly_add_vec($T2);
&poly_mul();
&poly_add_vec($T3);
&poly_mul();
$code.=<<___;
    st1 {$T0.16b - $T3.16b}, [$oup], #64
    sub $inl, $inl, #64

    // Shift the state left by 64 bytes for the next iteration of the loop
    mov $A0.16b, $A1.16b
    mov $B0.16b, $B1.16b
    mov $C0.16b, $C1.16b
    mov $D0.16b, $D1.16b

    mov $A1.16b, $A2.16b
    mov $B1.16b, $B2.16b
    mov $C1.16b, $C2.16b
    mov $D1.16b, $D2.16b

    mov $A2.16b, $A3.16b
    mov $B2.16b, $B3.16b
    mov $C2.16b, $C3.16b
    mov $D2.16b, $D3.16b

    mov $A3.16b, $A4.16b
    mov $B3.16b, $B4.16b
    mov $C3.16b, $C4.16b
    mov $D3.16b, $D4.16b

    b .Lseal_tail

.Lseal_tail_64:
    ldp $adp, $adl, [$keyp, #48] // extra_in_len and extra_in_ptr

    // Here we handle the last [0,64) bytes of plaintext
    cmp $inl, #16
    b.lt .Lseal_tail_16
    // Each iteration encrypts and authenticates a 16B block
    ld1 {$T0.16b}, [$inp], #16
    eor $T0.16b, $T0.16b, $A0.16b
___
&poly_add_vec($T0);
&poly_mul();
$code.=<<___;
    st1 {$T0.16b}, [$oup], #16

    sub $inl, $inl, #16

    // Shift the state left by 16 bytes for the next iteration of the loop
    mov $A0.16b, $B0.16b
    mov $B0.16b, $C0.16b
    mov $C0.16b, $D0.16b

    b .Lseal_tail_64

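    // The remaining [1,16) plaintext bytes (plus any extra_in bytes) are
    // composed byte-by-byte into $T0, while $T1 collects an all-ones mask
    // covering only the plaintext positions. Masking the keystream with $T1
    // before the XOR leaves the extra_in bytes unencrypted, and the
    // resulting padded 16-byte block is what gets hashed.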
.Lseal_tail_16:
    // Here we handle the last [0,16) bytes of ciphertext that require a padded block
    cbz $inl, .Lseal_hash_extra

    eor $T0.16b, $T0.16b, $T0.16b // Use T0 to load the plaintext/extra in
    eor $T1.16b, $T1.16b, $T1.16b // Use T1 to generate an AND mask that will only mask the ciphertext bytes
    not $T2.16b, $T0.16b

    mov $itr1, $inl
    add $inp, $inp, $inl

    cbz $adl, .Lseal_tail_16_compose // No extra data to pad with, zero padding

    mov $itr2, #16 // We need to load some extra_in first for padding
    sub $itr2, $itr2, $inl
    cmp $adl, $itr2
    csel $itr2, $adl, $itr2, lt // Load the minimum of extra_in_len and the amount needed to fill the register
    mov $t1, $itr2
    add $adp, $adp, $itr2
    sub $adl, $adl, $itr2

.Lseal_tail16_compose_extra_in:
    ext $T0.16b, $T0.16b, $T0.16b, #15
    ldrb $t0w, [$adp, #-1]!
    mov $T0.b[0], $t0w
    subs $itr2, $itr2, #1
    b.gt .Lseal_tail16_compose_extra_in

    add $adp, $adp, $t1

.Lseal_tail_16_compose:
    ext $T0.16b, $T0.16b, $T0.16b, #15
    ldrb $t0w, [$inp, #-1]!
    mov $T0.b[0], $t0w
    ext $T1.16b, $T2.16b, $T1.16b, #15
    subs $inl, $inl, #1
    b.gt .Lseal_tail_16_compose

    and $A0.16b, $A0.16b, $T1.16b
    eor $T0.16b, $T0.16b, $A0.16b
    mov $T1.16b, $T0.16b

.Lseal_tail_16_store:
    umov $t0w, $T0.b[0]
    strb $t0w, [$oup], #1
    ext $T0.16b, $T0.16b, $T0.16b, #1
    subs $itr1, $itr1, #1
    b.gt .Lseal_tail_16_store

    // Hash in the final ct block concatenated with extra_in
___
&poly_add_vec($T1);
&poly_mul();
$code.=<<___;

.Lseal_hash_extra:
    cbz $adl, .Lseal_finalize

.Lseal_hash_extra_loop:
    cmp $adl, #16
    b.lt .Lseal_hash_extra_tail
    ld1 {$T0.16b}, [$adp], #16
___
&poly_add_vec($T0);
&poly_mul();
$code.=<<___;
    sub $adl, $adl, #16
    b .Lseal_hash_extra_loop

.Lseal_hash_extra_tail:
    cbz $adl, .Lseal_finalize
    eor $T0.16b, $T0.16b, $T0.16b // Use T0 to load the remaining extra ciphertext
    add $adp, $adp, $adl

.Lseal_hash_extra_load:
    ext $T0.16b, $T0.16b, $T0.16b, #15
    ldrb $t0w, [$adp, #-1]!
    mov $T0.b[0], $t0w
    subs $adl, $adl, #1
    b.gt .Lseal_hash_extra_load

    // Hash in the final padded extra_in block
___
&poly_add_vec($T0);
&poly_mul();
$code.=<<___;

.Lseal_finalize:
___
&poly_add_vec($LEN_STORE);
&poly_mul();
$code.=<<___;
    // Final reduction step
    sub $t1, xzr, $one
    orr $t2, xzr, #3
    subs $t0, $acc0, #-5
    sbcs $t1, $acc1, $t1
    sbcs $t2, $acc2, $t2
    csel $acc0, $t0, $acc0, cs
    csel $acc1, $t1, $acc1, cs
    csel $acc2, $t2, $acc2, cs
___
&poly_add_vec($S_STORE);
$code.=<<___;

    stp $acc0, $acc1, [$keyp]

    ldp d8, d9, [sp, #16]
    ldp d10, d11, [sp, #32]
    ldp d12, d13, [sp, #48]
    ldp d14, d15, [sp, #64]
.cfi_restore b15
.cfi_restore b14
.cfi_restore b13
.cfi_restore b12
.cfi_restore b11
.cfi_restore b10
.cfi_restore b9
.cfi_restore b8
    ldp x29, x30, [sp], 80
.cfi_restore w29
.cfi_restore w30
.cfi_def_cfa_offset 0
    AARCH64_VALIDATE_LINK_REGISTER
    ret

.Lseal_128:
    // On some architectures preparing 5 blocks for small buffers is wasteful
    eor $INC.16b, $INC.16b, $INC.16b
    mov $t0, #1
    mov $INC.s[0], $t0w
    mov $A0.16b, $CONSTS.16b
    mov $A1.16b, $CONSTS.16b
    mov $A2.16b, $CONSTS.16b
    mov $B0.16b, $B_STORE.16b
    mov $B1.16b, $B_STORE.16b
    mov $B2.16b, $B_STORE.16b
    mov $C0.16b, $C_STORE.16b
    mov $C1.16b, $C_STORE.16b
    mov $C2.16b, $C_STORE.16b
    mov $D2.16b, $D_STORE.16b
    add $D0.4s, $D2.4s, $INC.4s
    add $D1.4s, $D0.4s, $INC.4s

    mov $itr1, #10

.Lseal_128_rounds:
___
&chacha_qr_x3("left");
&chacha_qr_x3("right");
$code.=<<___;
    subs $itr1, $itr1, #1
    b.hi .Lseal_128_rounds

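    // Only three blocks are generated on this path: A0-D0 and A1-D1 use
    // counters +1 and +2 and supply up to 128 bytes of keystream, while
    // A2-D2 keeps the caller's initial counter and is used solely to derive
    // the Poly1305 R and S keys.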
    add $A0.4s, $A0.4s, $CONSTS.4s
    add $A1.4s, $A1.4s, $CONSTS.4s
    add $A2.4s, $A2.4s, $CONSTS.4s

    add $B0.4s, $B0.4s, $B_STORE.4s
    add $B1.4s, $B1.4s, $B_STORE.4s
    add $B2.4s, $B2.4s, $B_STORE.4s

    // Only the first 32 bytes of the third block (counter = 0) are needed,
    // so skip updating $C2 and $D2.
    add $C0.4s, $C0.4s, $C_STORE.4s
    add $C1.4s, $C1.4s, $C_STORE.4s

    add $D_STORE.4s, $D_STORE.4s, $INC.4s
    add $D0.4s, $D0.4s, $D_STORE.4s
    add $D_STORE.4s, $D_STORE.4s, $INC.4s
    add $D1.4s, $D1.4s, $D_STORE.4s

    and $A2.16b, $A2.16b, $CLAMP.16b
    mov $r0, $A2.d[0] // Move the R key to GPRs
    mov $r1, $A2.d[1]
    mov $S_STORE.16b, $B2.16b // Store the S key

    bl .Lpoly_hash_ad_internal
    b .Lseal_tail
.cfi_endproc
.size chacha20_poly1305_seal,.-chacha20_poly1305_seal

/////////////////////////////////
//
// void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *aead_data);
//
.globl chacha20_poly1305_open
.type chacha20_poly1305_open,%function
.align 6
chacha20_poly1305_open:
    AARCH64_SIGN_LINK_REGISTER
.cfi_startproc
    stp x29, x30, [sp, #-80]!
.cfi_def_cfa_offset 80
.cfi_offset w30, -72
.cfi_offset w29, -80
    mov x29, sp
    // We probably could do .cfi_def_cfa w29, 80 at this point, but since
    // we don't actually use the frame pointer like that, it's probably not
    // worth bothering.
    stp d8, d9, [sp, #16]
    stp d10, d11, [sp, #32]
    stp d12, d13, [sp, #48]
    stp d14, d15, [sp, #64]
.cfi_offset b15, -8
.cfi_offset b14, -16
.cfi_offset b13, -24
.cfi_offset b12, -32
.cfi_offset b11, -40
.cfi_offset b10, -48
.cfi_offset b9, -56
.cfi_offset b8, -64

    adrp $t0, :pg_hi21:.Lchacha20_consts
    add $t0, $t0, :lo12:.Lchacha20_consts

    ld1 {$CONSTS.16b - $CLAMP.16b}, [$t0] // Load the CONSTS, INC, ROL8 and CLAMP values
    ld1 {$B_STORE.16b - $D_STORE.16b}, [$keyp]

    mov $one, #1 // Prepare the Poly1305 state
    mov $acc0, #0
    mov $acc1, #0
    mov $acc2, #0

    mov $LEN_STORE.d[0], $adl // Store the input and aad lengths
    mov $LEN_STORE.d[1], $inl

    cmp $inl, #128
    b.le .Lopen_128 // Optimization for smaller buffers

    // Initially we prepare a single ChaCha20 block for the Poly1305 R and S keys
    mov $A0.16b, $CONSTS.16b
    mov $B0.16b, $B_STORE.16b
    mov $C0.16b, $C_STORE.16b
    mov $D0.16b, $D_STORE.16b

    mov $itr1, #10

.align 5
.Lopen_init_rounds:
___
&chacha_qr($A0, $B0, $C0, $D0, $T0, "left");
&chacha_qr($A0, $B0, $C0, $D0, $T0, "right");
$code.=<<___;
    subs $itr1, $itr1, #1
    b.hi .Lopen_init_rounds

    add $A0.4s, $A0.4s, $CONSTS.4s
    add $B0.4s, $B0.4s, $B_STORE.4s

    and $A0.16b, $A0.16b, $CLAMP.16b
    mov $r0, $A0.d[0] // Move the R key to GPRs
    mov $r1, $A0.d[1]
    mov $S_STORE.16b, $B0.16b // Store the S key

    bl .Lpoly_hash_ad_internal

.Lopen_ad_done:
    mov $adp, $inp

    // Each iteration of the loop hashes 320 bytes and prepares the stream for 320 bytes
.Lopen_main_loop:

    cmp $inl, #192
    b.lt .Lopen_tail

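    // Unlike sealing, opening hashes the ciphertext before decrypting it, so
    // each iteration below interleaves Poly1305 over the next 320 bytes of
    // input with generation of the keystream that will decrypt them.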
    adrp $t0, :pg_hi21:.Lchacha20_consts
    add $t0, $t0, :lo12:.Lchacha20_consts

    ld4r {$A0.4s-$A3.4s}, [$t0]
    mov $A4.16b, $CONSTS.16b

    ld4r {$B0.4s-$B3.4s}, [$keyp], #16
    mov $B4.16b, $B_STORE.16b

    ld4r {$C0.4s-$C3.4s}, [$keyp], #16
    mov $C4.16b, $C_STORE.16b

    ld4r {$D0.4s-$D3.4s}, [$keyp]
    sub $keyp, $keyp, #32
    add $D0.4s, $D0.4s, $INC.4s
    mov $D4.16b, $D_STORE.16b

    eor $T0.16b, $T0.16b, $T0.16b //zero
    not $T1.16b, $T0.16b // -1
    sub $T1.4s, $INC.4s, $T1.4s // Add +1
    ext $T0.16b, $T1.16b, $T0.16b, #12 // Get the last element (counter)
    add $D4.4s, $D4.4s, $T0.4s

    lsr $adl, $inl, #4 // How many whole blocks we have to hash, will always be at least 12
    sub $adl, $adl, #10

    mov $itr2, #10
    subs $itr1, $itr2, $adl
    subs $itr1, $itr2, $adl // itr1 can be negative if we have more than 320 bytes to hash
    csel $itr2, $itr2, $adl, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are full

    cbz $itr2, .Lopen_main_loop_rounds_short

.align 5
.Lopen_main_loop_rounds:
___
&poly_add($adp);
&poly_mul();
$code.=<<___;
.Lopen_main_loop_rounds_short:
___
&chacha_qr_x5("left");
&poly_add($adp);
&poly_mul();
&chacha_qr_x5("right");
$code.=<<___;
    subs $itr2, $itr2, #1
    b.gt .Lopen_main_loop_rounds
    subs $itr1, $itr1, #1
    b.ge .Lopen_main_loop_rounds_short
___
$code.=<<___;

    eor $T0.16b, $T0.16b, $T0.16b //zero
    not $T1.16b, $T0.16b // -1
    sub $T1.4s, $INC.4s, $T1.4s // Add +1
    ext $T0.16b, $T1.16b, $T0.16b, #12 // Get the last element (counter)
    add $D4.4s, $D4.4s, $T0.4s

    add $D0.4s, $D0.4s, $INC.4s
    mov $t0, #5
    dup $T0.4s, $t0w
    add $INC.4s, $INC.4s, $T0.4s

    zip1 $T0.4s, $A0.4s, $A1.4s
    zip2 $T1.4s, $A0.4s, $A1.4s
    zip1 $T2.4s, $A2.4s, $A3.4s
    zip2 $T3.4s, $A2.4s, $A3.4s

    zip1 $A0.2d, $T0.2d, $T2.2d
    zip2 $A1.2d, $T0.2d, $T2.2d
    zip1 $A2.2d, $T1.2d, $T3.2d
    zip2 $A3.2d, $T1.2d, $T3.2d

    zip1 $T0.4s, $B0.4s, $B1.4s
    zip2 $T1.4s, $B0.4s, $B1.4s
    zip1 $T2.4s, $B2.4s, $B3.4s
    zip2 $T3.4s, $B2.4s, $B3.4s

    zip1 $B0.2d, $T0.2d, $T2.2d
    zip2 $B1.2d, $T0.2d, $T2.2d
    zip1 $B2.2d, $T1.2d, $T3.2d
    zip2 $B3.2d, $T1.2d, $T3.2d

    zip1 $T0.4s, $C0.4s, $C1.4s
    zip2 $T1.4s, $C0.4s, $C1.4s
    zip1 $T2.4s, $C2.4s, $C3.4s
    zip2 $T3.4s, $C2.4s, $C3.4s

    zip1 $C0.2d, $T0.2d, $T2.2d
    zip2 $C1.2d, $T0.2d, $T2.2d
    zip1 $C2.2d, $T1.2d, $T3.2d
    zip2 $C3.2d, $T1.2d, $T3.2d

    zip1 $T0.4s, $D0.4s, $D1.4s
    zip2 $T1.4s, $D0.4s, $D1.4s
    zip1 $T2.4s, $D2.4s, $D3.4s
    zip2 $T3.4s, $D2.4s, $D3.4s

    zip1 $D0.2d, $T0.2d, $T2.2d
    zip2 $D1.2d, $T0.2d, $T2.2d
    zip1 $D2.2d, $T1.2d, $T3.2d
    zip2 $D3.2d, $T1.2d, $T3.2d

    add $A0.4s, $A0.4s, $CONSTS.4s
    add $B0.4s, $B0.4s, $B_STORE.4s
    add $C0.4s, $C0.4s, $C_STORE.4s
    add $D0.4s, $D0.4s, $D_STORE.4s

    add $A1.4s, $A1.4s, $CONSTS.4s
    add $B1.4s, $B1.4s, $B_STORE.4s
    add $C1.4s, $C1.4s, $C_STORE.4s
    add $D1.4s, $D1.4s, $D_STORE.4s

    add $A2.4s, $A2.4s, $CONSTS.4s
    add $B2.4s, $B2.4s, $B_STORE.4s
    add $C2.4s, $C2.4s, $C_STORE.4s
    add $D2.4s, $D2.4s, $D_STORE.4s

    add $A3.4s, $A3.4s, $CONSTS.4s
    add $B3.4s, $B3.4s, $B_STORE.4s
    add $C3.4s, $C3.4s, $C_STORE.4s
    add $D3.4s, $D3.4s, $D_STORE.4s

    add $A4.4s, $A4.4s, $CONSTS.4s
    add $B4.4s, $B4.4s, $B_STORE.4s
    add $C4.4s, $C4.4s, $C_STORE.4s
    add $D4.4s, $D4.4s, $D_STORE.4s

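    // The loop was entered with at least 192 bytes of ciphertext remaining,
    // so the first three 64-byte groups below need no further length checks;
    // the last two blocks are each guarded by their own cmp/b.lt.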
    // We can always safely store 192 bytes
    ld1 {$T0.16b - $T3.16b}, [$inp], #64
    eor $T0.16b, $T0.16b, $A0.16b
    eor $T1.16b, $T1.16b, $B0.16b
    eor $T2.16b, $T2.16b, $C0.16b
    eor $T3.16b, $T3.16b, $D0.16b
    st1 {$T0.16b - $T3.16b}, [$oup], #64

    ld1 {$T0.16b - $T3.16b}, [$inp], #64
    eor $T0.16b, $T0.16b, $A1.16b
    eor $T1.16b, $T1.16b, $B1.16b
    eor $T2.16b, $T2.16b, $C1.16b
    eor $T3.16b, $T3.16b, $D1.16b
    st1 {$T0.16b - $T3.16b}, [$oup], #64

    ld1 {$T0.16b - $T3.16b}, [$inp], #64
    eor $T0.16b, $T0.16b, $A2.16b
    eor $T1.16b, $T1.16b, $B2.16b
    eor $T2.16b, $T2.16b, $C2.16b
    eor $T3.16b, $T3.16b, $D2.16b
    st1 {$T0.16b - $T3.16b}, [$oup], #64

    sub $inl, $inl, #192

    mov $A0.16b, $A3.16b
    mov $B0.16b, $B3.16b
    mov $C0.16b, $C3.16b
    mov $D0.16b, $D3.16b

    cmp $inl, #64
    b.lt .Lopen_tail_64_store

    ld1 {$T0.16b - $T3.16b}, [$inp], #64
    eor $T0.16b, $T0.16b, $A3.16b
    eor $T1.16b, $T1.16b, $B3.16b
    eor $T2.16b, $T2.16b, $C3.16b
    eor $T3.16b, $T3.16b, $D3.16b
    st1 {$T0.16b - $T3.16b}, [$oup], #64

    sub $inl, $inl, #64

    mov $A0.16b, $A4.16b
    mov $B0.16b, $B4.16b
    mov $C0.16b, $C4.16b
    mov $D0.16b, $D4.16b

    cmp $inl, #64
    b.lt .Lopen_tail_64_store

    ld1 {$T0.16b - $T3.16b}, [$inp], #64
    eor $T0.16b, $T0.16b, $A4.16b
    eor $T1.16b, $T1.16b, $B4.16b
    eor $T2.16b, $T2.16b, $C4.16b
    eor $T3.16b, $T3.16b, $D4.16b
    st1 {$T0.16b - $T3.16b}, [$oup], #64

    sub $inl, $inl, #64
    b .Lopen_main_loop

.Lopen_tail:

    cbz $inl, .Lopen_finalize

    lsr $adl, $inl, #4 // How many whole blocks we have to hash

    cmp $inl, #64
    b.le .Lopen_tail_64
    cmp $inl, #128
    b.le .Lopen_tail_128

.Lopen_tail_192:
    // We need three more blocks
    mov $A0.16b, $CONSTS.16b
    mov $A1.16b, $CONSTS.16b
    mov $A2.16b, $CONSTS.16b
    mov $B0.16b, $B_STORE.16b
    mov $B1.16b, $B_STORE.16b
    mov $B2.16b, $B_STORE.16b
    mov $C0.16b, $C_STORE.16b
    mov $C1.16b, $C_STORE.16b
    mov $C2.16b, $C_STORE.16b
    mov $D0.16b, $D_STORE.16b
    mov $D1.16b, $D_STORE.16b
    mov $D2.16b, $D_STORE.16b
    eor $T3.16b, $T3.16b, $T3.16b
    eor $T1.16b, $T1.16b, $T1.16b
    ins $T3.s[0], $INC.s[0]
    ins $T1.d[0], $one

    add $T2.4s, $T3.4s, $T1.4s
    add $T1.4s, $T2.4s, $T1.4s

    add $D0.4s, $D0.4s, $T1.4s
    add $D1.4s, $D1.4s, $T3.4s
    add $D2.4s, $D2.4s, $T2.4s

    mov $itr2, #10
    subs $itr1, $itr2, $adl // itr1 can be negative if we have more than 160 bytes to hash
    csel $itr2, $itr2, $adl, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are hashing
    sub $adl, $adl, $itr2

    cbz $itr2, .Lopen_tail_192_rounds_no_hash

.Lopen_tail_192_rounds:
___
&poly_add($adp);
&poly_mul();
$code.=<<___;
.Lopen_tail_192_rounds_no_hash:
___
&chacha_qr_x3("left");
&chacha_qr_x3("right");
$code.=<<___;
    subs $itr2, $itr2, #1
    b.gt .Lopen_tail_192_rounds
    subs $itr1, $itr1, #1
    b.ge .Lopen_tail_192_rounds_no_hash

    // We hashed 160 bytes at most, may still have 32 bytes left
.Lopen_tail_192_hash:
    cbz $adl, .Lopen_tail_192_hash_done
___
&poly_add($adp);
&poly_mul();
$code.=<<___;
    sub $adl, $adl, #1
    b .Lopen_tail_192_hash

.Lopen_tail_192_hash_done:

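    // ChaCha20 feed-forward: add the initial state (constants, key and
    // counter/nonce words) back into the permuted state before it is used
    // as keystream; the extra $T1/$T3/$T2 additions re-apply the per-block
    // counter offsets set up at .Lopen_tail_192.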
    add $A0.4s, $A0.4s, $CONSTS.4s
    add $A1.4s, $A1.4s, $CONSTS.4s
    add $A2.4s, $A2.4s, $CONSTS.4s
    add $B0.4s, $B0.4s, $B_STORE.4s
    add $B1.4s, $B1.4s, $B_STORE.4s
    add $B2.4s, $B2.4s, $B_STORE.4s
    add $C0.4s, $C0.4s, $C_STORE.4s
    add $C1.4s, $C1.4s, $C_STORE.4s
    add $C2.4s, $C2.4s, $C_STORE.4s
    add $D0.4s, $D0.4s, $D_STORE.4s
    add $D1.4s, $D1.4s, $D_STORE.4s
    add $D2.4s, $D2.4s, $D_STORE.4s

    add $D0.4s, $D0.4s, $T1.4s
    add $D1.4s, $D1.4s, $T3.4s
    add $D2.4s, $D2.4s, $T2.4s

    ld1 {$T0.16b - $T3.16b}, [$inp], #64

    eor $T0.16b, $T0.16b, $A1.16b
    eor $T1.16b, $T1.16b, $B1.16b
    eor $T2.16b, $T2.16b, $C1.16b
    eor $T3.16b, $T3.16b, $D1.16b

    st1 {$T0.16b - $T3.16b}, [$oup], #64

    ld1 {$T0.16b - $T3.16b}, [$inp], #64

    eor $T0.16b, $T0.16b, $A2.16b
    eor $T1.16b, $T1.16b, $B2.16b
    eor $T2.16b, $T2.16b, $C2.16b
    eor $T3.16b, $T3.16b, $D2.16b

    st1 {$T0.16b - $T3.16b}, [$oup], #64

    sub $inl, $inl, #128
    b .Lopen_tail_64_store

.Lopen_tail_128:
    // We need two more blocks
    mov $A0.16b, $CONSTS.16b
    mov $A1.16b, $CONSTS.16b
    mov $B0.16b, $B_STORE.16b
    mov $B1.16b, $B_STORE.16b
    mov $C0.16b, $C_STORE.16b
    mov $C1.16b, $C_STORE.16b
    mov $D0.16b, $D_STORE.16b
    mov $D1.16b, $D_STORE.16b
    eor $T3.16b, $T3.16b, $T3.16b
    eor $T2.16b, $T2.16b, $T2.16b
    ins $T3.s[0], $INC.s[0]
    ins $T2.d[0], $one
    add $T2.4s, $T2.4s, $T3.4s

    add $D0.4s, $D0.4s, $T2.4s
    add $D1.4s, $D1.4s, $T3.4s

    mov $itr1, #10
    sub $itr1, $itr1, $adl

.Lopen_tail_128_rounds:
___
&chacha_qr($A0, $B0, $C0, $D0, $T0, "left");
&chacha_qr($A1, $B1, $C1, $D1, $T0, "left");
&chacha_qr($A0, $B0, $C0, $D0, $T0, "right");
&chacha_qr($A1, $B1, $C1, $D1, $T0, "right");
$code.=<<___;
    subs $itr1, $itr1, #1
    b.gt .Lopen_tail_128_rounds
    cbz $adl, .Lopen_tail_128_rounds_done
    subs $adl, $adl, #1
___
&poly_add($adp);
&poly_mul();
$code.=<<___;
    b .Lopen_tail_128_rounds

.Lopen_tail_128_rounds_done:
    add $A0.4s, $A0.4s, $CONSTS.4s
    add $A1.4s, $A1.4s, $CONSTS.4s
    add $B0.4s, $B0.4s, $B_STORE.4s
    add $B1.4s, $B1.4s, $B_STORE.4s
    add $C0.4s, $C0.4s, $C_STORE.4s
    add $C1.4s, $C1.4s, $C_STORE.4s
    add $D0.4s, $D0.4s, $D_STORE.4s
    add $D1.4s, $D1.4s, $D_STORE.4s
    add $D0.4s, $D0.4s, $T2.4s
    add $D1.4s, $D1.4s, $T3.4s

    ld1 {$T0.16b - $T3.16b}, [$inp], #64

    eor $T0.16b, $T0.16b, $A1.16b
    eor $T1.16b, $T1.16b, $B1.16b
    eor $T2.16b, $T2.16b, $C1.16b
    eor $T3.16b, $T3.16b, $D1.16b

    st1 {$T0.16b - $T3.16b}, [$oup], #64
    sub $inl, $inl, #64

    b .Lopen_tail_64_store

.Lopen_tail_64:
    // We just need a single block
    mov $A0.16b, $CONSTS.16b
    mov $B0.16b, $B_STORE.16b
    mov $C0.16b, $C_STORE.16b
    mov $D0.16b, $D_STORE.16b
    eor $T3.16b, $T3.16b, $T3.16b
    ins $T3.s[0], $INC.s[0]
    add $D0.4s, $D0.4s, $T3.4s

    mov $itr1, #10
    sub $itr1, $itr1, $adl

.Lopen_tail_64_rounds:
___
&chacha_qr($A0, $B0, $C0, $D0, $T0, "left");
&chacha_qr($A0, $B0, $C0, $D0, $T0, "right");
$code.=<<___;
    subs $itr1, $itr1, #1
    b.gt .Lopen_tail_64_rounds
    cbz $adl, .Lopen_tail_64_rounds_done
    subs $adl, $adl, #1
___
&poly_add($adp);
&poly_mul();
$code.=<<___;
    b .Lopen_tail_64_rounds

.Lopen_tail_64_rounds_done:
    add $A0.4s, $A0.4s, $CONSTS.4s
    add $B0.4s, $B0.4s, $B_STORE.4s
    add $C0.4s, $C0.4s, $C_STORE.4s
    add $D0.4s, $D0.4s, $D_STORE.4s
    add $D0.4s, $D0.4s, $T3.4s
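    // .Lopen_tail_64_store drains the remaining keystream 16 bytes at a
    // time; the A0<-B0<-C0<-D0 shuffle treats the four row registers as a
    // small queue of keystream vectors.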

.Lopen_tail_64_store:
    cmp $inl, #16
    b.lt .Lopen_tail_16

    ld1 {$T0.16b}, [$inp], #16
    eor $T0.16b, $T0.16b, $A0.16b
    st1 {$T0.16b}, [$oup], #16
    mov $A0.16b, $B0.16b
    mov $B0.16b, $C0.16b
    mov $C0.16b, $D0.16b
    sub $inl, $inl, #16
    b .Lopen_tail_64_store

.Lopen_tail_16:
    // Here we handle the last [0,16) bytes that require a padded block
    cbz $inl, .Lopen_finalize

    eor $T0.16b, $T0.16b, $T0.16b // Use T0 to load the ciphertext
    eor $T1.16b, $T1.16b, $T1.16b // Use T1 to generate an AND mask
    not $T2.16b, $T0.16b

    add $itr2, $inp, $inl
    mov $itr1, $inl

.Lopen_tail_16_compose:
    ext $T0.16b, $T0.16b, $T0.16b, #15
    ldrb $t0w, [$itr2, #-1]!
    mov $T0.b[0], $t0w
    ext $T1.16b, $T2.16b, $T1.16b, #15
    subs $inl, $inl, #1
    b.gt .Lopen_tail_16_compose

    and $T0.16b, $T0.16b, $T1.16b
    // Hash in the final padded block
___
&poly_add_vec($T0);
&poly_mul();
$code.=<<___;
    eor $T0.16b, $T0.16b, $A0.16b

.Lopen_tail_16_store:
    umov $t0w, $T0.b[0]
    strb $t0w, [$oup], #1
    ext $T0.16b, $T0.16b, $T0.16b, #1
    subs $itr1, $itr1, #1
    b.gt .Lopen_tail_16_store

.Lopen_finalize:
___
&poly_add_vec($LEN_STORE);
&poly_mul();
$code.=<<___;
    // Final reduction step
    sub $t1, xzr, $one
    orr $t2, xzr, #3
    subs $t0, $acc0, #-5
    sbcs $t1, $acc1, $t1
    sbcs $t2, $acc2, $t2
    csel $acc0, $t0, $acc0, cs
    csel $acc1, $t1, $acc1, cs
    csel $acc2, $t2, $acc2, cs
___
&poly_add_vec($S_STORE);
$code.=<<___;

    stp $acc0, $acc1, [$keyp]

    ldp d8, d9, [sp, #16]
    ldp d10, d11, [sp, #32]
    ldp d12, d13, [sp, #48]
    ldp d14, d15, [sp, #64]
.cfi_restore b15
.cfi_restore b14
.cfi_restore b13
.cfi_restore b12
.cfi_restore b11
.cfi_restore b10
.cfi_restore b9
.cfi_restore b8
    ldp x29, x30, [sp], 80
.cfi_restore w29
.cfi_restore w30
.cfi_def_cfa_offset 0
    AARCH64_VALIDATE_LINK_REGISTER
    ret

.Lopen_128:
    // On some architectures preparing 5 blocks for small buffers is wasteful
    eor $INC.16b, $INC.16b, $INC.16b
    mov $t0, #1
    mov $INC.s[0], $t0w
    mov $A0.16b, $CONSTS.16b
    mov $A1.16b, $CONSTS.16b
    mov $A2.16b, $CONSTS.16b
    mov $B0.16b, $B_STORE.16b
    mov $B1.16b, $B_STORE.16b
    mov $B2.16b, $B_STORE.16b
    mov $C0.16b, $C_STORE.16b
    mov $C1.16b, $C_STORE.16b
    mov $C2.16b, $C_STORE.16b
    mov $D2.16b, $D_STORE.16b
    add $D0.4s, $D2.4s, $INC.4s
    add $D1.4s, $D0.4s, $INC.4s

    mov $itr1, #10

.Lopen_128_rounds:
___
&chacha_qr_x3("left");
&chacha_qr_x3("right");
$code.=<<___;
    subs $itr1, $itr1, #1
    b.hi .Lopen_128_rounds

    add $A0.4s, $A0.4s, $CONSTS.4s
    add $A1.4s, $A1.4s, $CONSTS.4s
    add $A2.4s, $A2.4s, $CONSTS.4s

    add $B0.4s, $B0.4s, $B_STORE.4s
    add $B1.4s, $B1.4s, $B_STORE.4s
    add $B2.4s, $B2.4s, $B_STORE.4s

    add $C0.4s, $C0.4s, $C_STORE.4s
    add $C1.4s, $C1.4s, $C_STORE.4s

    add $D_STORE.4s, $D_STORE.4s, $INC.4s
    add $D0.4s, $D0.4s, $D_STORE.4s
    add $D_STORE.4s, $D_STORE.4s, $INC.4s
    add $D1.4s, $D1.4s, $D_STORE.4s

    and $A2.16b, $A2.16b, $CLAMP.16b
    mov $r0, $A2.d[0] // Move the R key to GPRs
    mov $r1, $A2.d[1]
    mov $S_STORE.16b, $B2.16b // Store the S key

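    // Hash the AAD first; the at most 128 bytes of ciphertext are then
    // hashed and decrypted 64 bytes at a time below, with any remainder
    // handled via .Lopen_128_hash_64 and .Lopen_tail_64_store.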
    bl .Lpoly_hash_ad_internal

.Lopen_128_store:
    cmp $inl, #64
    b.lt .Lopen_128_store_64

    ld1 {$T0.16b - $T3.16b}, [$inp], #64

___
&poly_add_vec($T0);
&poly_mul();
&poly_add_vec($T1);
&poly_mul();
&poly_add_vec($T2);
&poly_mul();
&poly_add_vec($T3);
&poly_mul();
$code.=<<___;

    eor $T0.16b, $T0.16b, $A0.16b
    eor $T1.16b, $T1.16b, $B0.16b
    eor $T2.16b, $T2.16b, $C0.16b
    eor $T3.16b, $T3.16b, $D0.16b

    st1 {$T0.16b - $T3.16b}, [$oup], #64

    sub $inl, $inl, #64

    mov $A0.16b, $A1.16b
    mov $B0.16b, $B1.16b
    mov $C0.16b, $C1.16b
    mov $D0.16b, $D1.16b

.Lopen_128_store_64:

    lsr $adl, $inl, #4
    mov $adp, $inp

.Lopen_128_hash_64:
    cbz $adl, .Lopen_tail_64_store
___
&poly_add($adp);
&poly_mul();
$code.=<<___;
    sub $adl, $adl, #1
    b .Lopen_128_hash_64
.cfi_endproc
.size chacha20_poly1305_open,.-chacha20_poly1305_open
___
}

foreach (split("\n",$code)) {
    s/\`([^\`]*)\`/eval $1/ge;

    print $_,"\n";
}
close STDOUT or die "error closing STDOUT";