#! /usr/bin/env perl
# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# December 2014
#
# ChaCha20 for ARMv4.
#
# Performance in cycles per byte out of a large buffer.
#
#			IALU/gcc-4.4	1xNEON		3xNEON+1xIALU
#
# Cortex-A5		19.3(*)/+95%	21.8		14.1
# Cortex-A8		10.5(*)/+160%	13.9		6.35
# Cortex-A9		12.9(**)/+110%	14.3		6.50
# Cortex-A15		11.0/+40%	16.0		5.00
# Snapdragon S4		11.5/+125%	13.6		4.90
#
# (*)	most "favourable" result for aligned data on a little-endian
#	processor; the result for misaligned data is 10-15% lower;
# (**)	this result is a trade-off: it can be improved by 20%,
#	but then Snapdragon S4 and Cortex-A8 results get
#	20-25% worse;

$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
    *STDOUT=*OUT;
} else {
    open OUT,">$output";
    *STDOUT=*OUT;
}

sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

my @x=map("r$_",(0..7,"x","x","x","x",12,"x",14,"x"));
my @t=map("r$_",(8..11));
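
# The AUTOLOAD thunk above turns any otherwise-undefined sub call into
# one line of assembly: underscores in the name become dots, and a
# purely numeric last argument gets an immediate '#' prefix. For
# instance (illustration only, not part of the generated output):
#
#	&add	("r0","r0","r4");	# appends "\tadd\tr0,r0,r4\n"
#	&vshr_u32("q12","q4",20);	# appends "\tvshr.u32\tq12,q4,#20\n"
#
# @x maps the 16 ChaCha state words to registers: words 0-7 live in
# r0-r7, word 12 in r12 and word 14 in r14, while the "x" entries mark
# the words (8-11, 13 and 15) that are kept on the stack. @t (r8-r11)
# are scratch registers.
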
sub ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my $odd = $d0&1;
my ($xc,$xc_)=(@t[0..1]);
my ($xd,$xd_)=$odd ? (@t[2],@x[$d1]) : (@x[$d0],@t[2]);
my @ret;

	# Consider the order in which the variables are addressed by their
	# index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# 'a' and 'b' are permanently allocated in registers, @x[0..7],
	# while the 'c's and a pair of 'd's are maintained in memory. If
	# you observe the 'c' column, you'll notice that a pair of 'c's is
	# invariant between rounds. This means that we have to reload
	# them only once per round, in the middle. This is why you'll see
	# a bunch of 'c' stores and loads in the middle, but none at the
	# beginning or end. If you observe the 'd' column, you'll notice
	# that 15 and 13 are reused in the next pair of rounds. This is
	# why these two are chosen for offloading to memory, to make the
	# loads count more.
	push @ret,(
	"&add	(@x[$a0],@x[$a0],@x[$b0])",
	"&mov	($xd,$xd,'ror#16')",
	 "&add	(@x[$a1],@x[$a1],@x[$b1])",
	 "&mov	($xd_,$xd_,'ror#16')",
	"&eor	($xd,$xd,@x[$a0],'ror#16')",
	 "&eor	($xd_,$xd_,@x[$a1],'ror#16')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b0],@x[$b0],'ror#20')",
	 "&add	($xc_,$xc_,$xd_)",
	 "&mov	(@x[$b1],@x[$b1],'ror#20')",
	"&eor	(@x[$b0],@x[$b0],$xc,'ror#20')",
	 "&eor	(@x[$b1],@x[$b1],$xc_,'ror#20')",

	"&add	(@x[$a0],@x[$a0],@x[$b0])",
	"&mov	($xd,$xd,'ror#24')",
	 "&add	(@x[$a1],@x[$a1],@x[$b1])",
	 "&mov	($xd_,$xd_,'ror#24')",
	"&eor	($xd,$xd,@x[$a0],'ror#24')",
	 "&eor	($xd_,$xd_,@x[$a1],'ror#24')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b0],@x[$b0],'ror#25')" );
	push @ret,(
	"&str	($xd,'[sp,#4*(16+$d0)]')",
	"&ldr	($xd,'[sp,#4*(16+$d2)]')" )		if ($odd);
	push @ret,(
	 "&add	($xc_,$xc_,$xd_)",
	 "&mov	(@x[$b1],@x[$b1],'ror#25')" );
	push @ret,(
	 "&str	($xd_,'[sp,#4*(16+$d1)]')",
	 "&ldr	($xd_,'[sp,#4*(16+$d3)]')" )		if (!$odd);
	push @ret,(
	"&eor	(@x[$b0],@x[$b0],$xc,'ror#25')",
	 "&eor	(@x[$b1],@x[$b1],$xc_,'ror#25')" );

	$xd=@x[$d2]					if (!$odd);
	$xd_=@x[$d3]					if ($odd);
	push @ret,(
	"&str	($xc,'[sp,#4*(16+$c0)]')",
	"&ldr	($xc,'[sp,#4*(16+$c2)]')",
	"&add	(@x[$a2],@x[$a2],@x[$b2])",
	"&mov	($xd,$xd,'ror#16')",
	 "&str	($xc_,'[sp,#4*(16+$c1)]')",
	 "&ldr	($xc_,'[sp,#4*(16+$c3)]')",
	 "&add	(@x[$a3],@x[$a3],@x[$b3])",
	 "&mov	($xd_,$xd_,'ror#16')",
	"&eor	($xd,$xd,@x[$a2],'ror#16')",
	 "&eor	($xd_,$xd_,@x[$a3],'ror#16')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b2],@x[$b2],'ror#20')",
	 "&add	($xc_,$xc_,$xd_)",
	 "&mov	(@x[$b3],@x[$b3],'ror#20')",
	"&eor	(@x[$b2],@x[$b2],$xc,'ror#20')",
	 "&eor	(@x[$b3],@x[$b3],$xc_,'ror#20')",

	"&add	(@x[$a2],@x[$a2],@x[$b2])",
	"&mov	($xd,$xd,'ror#24')",
	 "&add	(@x[$a3],@x[$a3],@x[$b3])",
	 "&mov	($xd_,$xd_,'ror#24')",
	"&eor	($xd,$xd,@x[$a2],'ror#24')",
	 "&eor	($xd_,$xd_,@x[$a3],'ror#24')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b2],@x[$b2],'ror#25')",
	 "&add	($xc_,$xc_,$xd_)",
	 "&mov	(@x[$b3],@x[$b3],'ror#25')",
	"&eor	(@x[$b2],@x[$b2],$xc,'ror#25')",
	 "&eor	(@x[$b3],@x[$b3],$xc_,'ror#25')" );

	@ret;
}
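
# ROUND() emits all four quarter-rounds of one ChaCha round, pairwise
# interleaved. The index map ($_&~3)+(($_+1)&3) keeps the 4-aligned base
# of each index and steps its low two bits, so every derived tuple is
# the next column of the table in the comment above. A quick sanity
# check (illustration only, never executed by the generator):
#
#	map { ($_&~3)+(($_+1)&3) } (0,4,8,12);	# -> (1,5,9,13)
#	map { ($_&~3)+(($_+1)&3) } (0,5,10,15);	# -> (1,6,11,12)
#
# ROUND() returns strings rather than emitting code directly; they are
# eval'ed one at a time, which is what lets the NEON path further down
# interleave this scalar code instruction-by-instruction with vector
# instructions.
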
$code.=<<___;
#include <ring-core/arm_arch.h>

@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
.arch	armv7-a

.text
#if defined(__thumb2__) || defined(__clang__)
.syntax	unified
#endif
#if defined(__thumb2__)
.thumb
#else
.code	32
#endif

#if defined(__thumb2__) || defined(__clang__)
#define ldrhsb	ldrbhs
#endif

.align	5
.Lsigma:
.long	0x61707865,0x3320646e,0x79622d32,0x6b206574	@ endian-neutral
.Lone:
.long	1,0,0,0
#if __ARM_MAX_ARCH__>=7
.extern	OPENSSL_armcap_P
.hidden	OPENSSL_armcap_P
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.LChaCha20_ctr32
#else
.word	-1
#endif

.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
ChaCha20_ctr32:
.LChaCha20_ctr32:
	ldr	r12,[sp,#0]		@ pull pointer to counter and nonce
	stmdb	sp!,{r0-r2,r4-r11,lr}
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub	r14,pc,#16		@ ChaCha20_ctr32
#else
	adr	r14,.LChaCha20_ctr32
#endif
	cmp	r2,#0			@ len==0?
#ifdef	__thumb2__
	itt	eq
#endif
	addeq	sp,sp,#4*3
	beq	.Lno_data
#if __ARM_MAX_ARCH__>=7
	cmp	r2,#192			@ test len
	bls	.Lshort
	ldr	r4,[r14,#-32]
	ldr	r4,[r14,r4]
# ifdef	__APPLE__
	ldr	r4,[r4]
# endif
	tst	r4,#ARMV7_NEON
	bne	.LChaCha20_neon
.Lshort:
#endif
	ldmia	r12,{r4-r7}		@ load counter and nonce
	sub	sp,sp,#4*(16)		@ off-load area
	sub	r14,r14,#64		@ .Lsigma
	stmdb	sp!,{r4-r7}		@ copy counter and nonce
	ldmia	r3,{r4-r11}		@ load key
	ldmia	r14,{r0-r3}		@ load sigma
	stmdb	sp!,{r4-r11}		@ copy key
	stmdb	sp!,{r0-r3}		@ copy sigma
	str	r10,[sp,#4*(16+10)]	@ off-load "@x[10]"
	str	r11,[sp,#4*(16+11)]	@ off-load "@x[11]"
	b	.Loop_outer_enter
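
@ Stack frame of the integer-only path, offsets in 4-byte words:
@
@   sp+4*0  .. sp+4*15	copy of sigma|key|counter|nonce ("key material")
@   sp+4*16 .. sp+4*31	off-load area for the state words kept in memory
@   sp+4*32 .. sp+4*34	saved out, inp and len (the pushed r0-r2)
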
.align	4
.Loop_outer:
	ldmia	sp,{r0-r9}		@ load key material
	str	@t[3],[sp,#4*(32+2)]	@ save len
	str	r12, [sp,#4*(32+1)]	@ save inp
	str	r14, [sp,#4*(32+0)]	@ save out
.Loop_outer_enter:
	ldr	@t[3], [sp,#4*(15)]
	ldr	@x[12],[sp,#4*(12)]	@ modulo-scheduled load
	ldr	@t[2], [sp,#4*(13)]
	ldr	@x[14],[sp,#4*(14)]
	str	@t[3], [sp,#4*(16+15)]
	mov	@t[3],#10
	b	.Loop

.align	4
.Loop:
	subs	@t[3],@t[3],#1
___
	foreach (&ROUND(0, 4, 8,12)) { eval; }
	foreach (&ROUND(0, 5,10,15)) { eval; }
$code.=<<___;
	bne	.Loop

	ldr	@t[3],[sp,#4*(32+2)]	@ load len

	str	@t[0], [sp,#4*(16+8)]	@ modulo-scheduled store
	str	@t[1], [sp,#4*(16+9)]
	str	@x[12],[sp,#4*(16+12)]
	str	@t[2], [sp,#4*(16+13)]
	str	@x[14],[sp,#4*(16+14)]

	@ at this point we have first half of 512-bit result in
	@ @x[0-7] and second half at sp+4*(16+8)

	cmp	@t[3],#64		@ done yet?
#ifdef	__thumb2__
	itete	lo
#endif
	addlo	r12,sp,#4*(0)		@ shortcut or ...
	ldrhs	r12,[sp,#4*(32+1)]	@ ... load inp
	addlo	r14,sp,#4*(0)		@ shortcut or ...
	ldrhs	r14,[sp,#4*(32+0)]	@ ... load out
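
@ Shortcut for the final partial block: if less than 64 bytes remain
@ (lo), the input and output pointers are both aimed at the stack, the
@ conditional loads and xors below turn into no-ops, and the raw
@ keystream block lands at sp+4*0, where .Loop_tail picks it up and
@ xors it with the input byte by byte.
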
	ldr	@t[0],[sp,#4*(0)]	@ load key material
	ldr	@t[1],[sp,#4*(1)]

#if __ARM_ARCH__>=6 || !defined(__ARMEB__)
# if __ARM_ARCH__<7
	orr	@t[2],r12,r14
	tst	@t[2],#3		@ are input and output aligned?
	ldr	@t[2],[sp,#4*(2)]
	bne	.Lunaligned
	cmp	@t[3],#64		@ restore flags
# else
	ldr	@t[2],[sp,#4*(2)]
# endif
	ldr	@t[3],[sp,#4*(3)]

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	add	@x[1],@x[1],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]

	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[0],@x[0],@t[0]	@ xor with input
	eorhs	@x[1],@x[1],@t[1]
	 add	@t[0],sp,#4*(4)
	str	@x[0],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[2],@x[2],@t[2]
	eorhs	@x[3],@x[3],@t[3]
	 ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[1],[r14,#-12]
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
	add	@x[6],@x[6],@t[2]
	add	@x[7],@x[7],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[4],@x[4],@t[0]
	eorhs	@x[5],@x[5],@t[1]
	 add	@t[0],sp,#4*(8)
	str	@x[4],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[6],@x[6],@t[2]
	eorhs	@x[7],@x[7],@t[3]
	str	@x[5],[r14,#-12]
	 ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[6],[r14,#-8]
	 add	@x[0],sp,#4*(16+8)
	str	@x[7],[r14,#-4]

	ldmia	@x[0],{@x[0]-@x[7]}	@ load second half

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	add	@x[1],@x[1],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
# ifdef	__thumb2__
	itt	hi
# endif
	 strhi	@t[2],[sp,#4*(16+10)]	@ copy "@x[10]" while at it
	 strhi	@t[3],[sp,#4*(16+11)]	@ copy "@x[11]" while at it
	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[0],@x[0],@t[0]
	eorhs	@x[1],@x[1],@t[1]
	 add	@t[0],sp,#4*(12)
	str	@x[0],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[2],@x[2],@t[2]
	eorhs	@x[3],@x[3],@t[3]
	str	@x[1],[r14,#-12]
	 ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	itt	hi
# endif
	 addhi	@t[0],@t[0],#1		@ next counter value
	 strhi	@t[0],[sp,#4*(12)]	@ save next counter value
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
	add	@x[6],@x[6],@t[2]
	add	@x[7],@x[7],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[4],@x[4],@t[0]
	eorhs	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	it	ne
# endif
	ldrne	@t[0],[sp,#4*(32+2)]	@ re-load len
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[6],@x[6],@t[2]
	eorhs	@x[7],@x[7],@t[3]
	str	@x[4],[r14],#16		@ store output
	str	@x[5],[r14,#-12]
# ifdef	__thumb2__
	it	hs
# endif
	subhs	@t[3],@t[0],#64		@ len-=64
	str	@x[6],[r14,#-8]
	str	@x[7],[r14,#-4]
	bhi	.Loop_outer

	beq	.Ldone
# if __ARM_ARCH__<7
	b	.Ltail

.align	4
.Lunaligned:				@ unaligned endian-neutral path
	cmp	@t[3],#64		@ restore flags
# endif
#endif
#if __ARM_ARCH__<7
	ldr	@t[3],[sp,#4*(3)]
___
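
# The loop below generates the byte-at-a-time, endian-neutral path
# (compiled only when __ARM_ARCH__<7): each pass covers four state
# words, fetching input one byte at a time (ldrhsb) and merging it with
# eor+lsr#8 steps, so no word-sized memory access is ever issued. When
# fewer than 64 bytes remain (lo), the eorlo instructions zero the
# input registers instead, turning the very same instruction stream
# into a "write keystream to the stack" path for .Loop_tail.
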
for ($i=0;$i<16;$i+=4) {
my $j=$i&0x7;

$code.=<<___	if ($i==4);
	add	@x[0],sp,#4*(16+8)
___
$code.=<<___	if ($i==8);
	ldmia	@x[0],{@x[0]-@x[7]}		@ load second half
# ifdef	__thumb2__
	itt	hi
# endif
	strhi	@t[2],[sp,#4*(16+10)]		@ copy "@x[10]"
	strhi	@t[3],[sp,#4*(16+11)]		@ copy "@x[11]"
___
$code.=<<___;
	add	@x[$j+0],@x[$j+0],@t[0]		@ accumulate key material
___
$code.=<<___	if ($i==12);
# ifdef	__thumb2__
	itt	hi
# endif
	addhi	@t[0],@t[0],#1			@ next counter value
	strhi	@t[0],[sp,#4*(12)]		@ save next counter value
___
$code.=<<___;
	add	@x[$j+1],@x[$j+1],@t[1]
	add	@x[$j+2],@x[$j+2],@t[2]
# ifdef	__thumb2__
	itete	lo
# endif
	eorlo	@t[0],@t[0],@t[0]		@ zero or ...
	ldrhsb	@t[0],[r12],#16			@ ... load input
	eorlo	@t[1],@t[1],@t[1]
	ldrhsb	@t[1],[r12,#-12]

	add	@x[$j+3],@x[$j+3],@t[3]
# ifdef	__thumb2__
	itete	lo
# endif
	eorlo	@t[2],@t[2],@t[2]
	ldrhsb	@t[2],[r12,#-8]
	eorlo	@t[3],@t[3],@t[3]
	ldrhsb	@t[3],[r12,#-4]

	eor	@x[$j+0],@t[0],@x[$j+0]		@ xor with input (or zero)
	eor	@x[$j+1],@t[1],@x[$j+1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-15]		@ load more input
	ldrhsb	@t[1],[r12,#-11]
	eor	@x[$j+2],@t[2],@x[$j+2]
	 strb	@x[$j+0],[r14],#16		@ store output
	eor	@x[$j+3],@t[3],@x[$j+3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-7]
	ldrhsb	@t[3],[r12,#-3]
	 strb	@x[$j+1],[r14,#-12]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	 strb	@x[$j+2],[r14,#-8]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-14]		@ load more input
	ldrhsb	@t[1],[r12,#-10]
	 strb	@x[$j+3],[r14,#-4]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	 strb	@x[$j+0],[r14,#-15]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-6]
	ldrhsb	@t[3],[r12,#-2]
	 strb	@x[$j+1],[r14,#-11]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	 strb	@x[$j+2],[r14,#-7]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-13]		@ load more input
	ldrhsb	@t[1],[r12,#-9]
	 strb	@x[$j+3],[r14,#-3]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	 strb	@x[$j+0],[r14,#-14]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-5]
	ldrhsb	@t[3],[r12,#-1]
	 strb	@x[$j+1],[r14,#-10]
	 strb	@x[$j+2],[r14,#-6]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	 strb	@x[$j+3],[r14,#-2]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
	 strb	@x[$j+0],[r14,#-13]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	 strb	@x[$j+1],[r14,#-9]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
	 strb	@x[$j+2],[r14,#-5]
	 strb	@x[$j+3],[r14,#-1]
___
$code.=<<___	if ($i<12);
	add	@t[0],sp,#4*(4+$i)
	ldmia	@t[0],{@t[0]-@t[3]}		@ load key material
___
}
$code.=<<___;
# ifdef	__thumb2__
	it	ne
# endif
	ldrne	@t[0],[sp,#4*(32+2)]		@ re-load len
# ifdef	__thumb2__
	it	hs
# endif
	subhs	@t[3],@t[0],#64			@ len-=64
	bhi	.Loop_outer

	beq	.Ldone
#endif

.Ltail:
	ldr	r12,[sp,#4*(32+1)]	@ load inp
	add	@t[1],sp,#4*(0)
	ldr	r14,[sp,#4*(32+0)]	@ load out

.Loop_tail:
	ldrb	@t[2],[@t[1]],#1	@ read buffer on stack
	ldrb	@t[3],[r12],#1		@ read input
	subs	@t[0],@t[0],#1
	eor	@t[3],@t[3],@t[2]
	strb	@t[3],[r14],#1		@ store output
	bne	.Loop_tail

.Ldone:
	add	sp,sp,#4*(32+3)
.Lno_data:
	ldmia	sp!,{r4-r11,pc}
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
___

{{{
my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$t0,$t1,$t2,$t3) =
    map("q$_",(0..15));

sub NEONROUND {
my $odd = pop;
my ($a,$b,$c,$d,$t)=@_;

	(
	"&vadd_i32	($a,$a,$b)",
	"&veor		($d,$d,$a)",
	"&vrev32_16	($d,$d)",	# vrot ($d,16)

	"&vadd_i32	($c,$c,$d)",
	"&veor		($t,$b,$c)",
	"&vshr_u32	($b,$t,20)",
	"&vsli_32	($b,$t,12)",

	"&vadd_i32	($a,$a,$b)",
	"&veor		($t,$d,$a)",
	"&vshr_u32	($d,$t,24)",
	"&vsli_32	($d,$t,8)",

	"&vadd_i32	($c,$c,$d)",
	"&veor		($t,$b,$c)",
	"&vshr_u32	($b,$t,25)",
	"&vsli_32	($b,$t,7)",

	"&vext_8	($c,$c,$c,8)",
	"&vext_8	($b,$b,$b,$odd?12:4)",
	"&vext_8	($d,$d,$d,$odd?4:12)"
	);
}
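
# NEON has no rotate instruction, so NEONROUND synthesizes every
# rotation: rotate-by-16 is a single vrev32.16 (swapping the 16-bit
# halves of each lane), and the other amounts use a shift plus
# shift-and-insert pair, e.g. a rotate-left by 12 is vshr.u32 dst,src,#20
# followed by vsli.32 dst,src,#12. The trailing vext.8 instructions
# rotate the b, c and d rows by one, two and three lanes, so the next
# call operates on diagonals; with $odd set the rotation is undone and
# the state returns to column order.
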
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
	ldr		r12,[sp,#0]		@ pull pointer to counter and nonce
	stmdb		sp!,{r0-r2,r4-r11,lr}
.LChaCha20_neon:
	adr		r14,.Lsigma
	vstmdb		sp!,{d8-d15}		@ ABI spec says so
	stmdb		sp!,{r0-r3}

	vld1.32		{$b0-$c0},[r3]		@ load key
	ldmia		r3,{r4-r11}		@ load key

	sub		sp,sp,#4*(16+16)
	vld1.32		{$d0},[r12]		@ load counter and nonce
	add		r12,sp,#4*8
	ldmia		r14,{r0-r3}		@ load sigma
	vld1.32		{$a0},[r14]!		@ load sigma
	vld1.32		{$t0},[r14]		@ one
	vst1.32		{$c0-$d0},[r12]		@ copy 1/2key|counter|nonce
	vst1.32		{$a0-$b0},[sp]		@ copy sigma|1/2key

	str		r10,[sp,#4*(16+10)]	@ off-load "@x[10]"
	str		r11,[sp,#4*(16+11)]	@ off-load "@x[11]"
	vshl.i32	$t1#lo,$t0#lo,#1	@ two
	vstr		$t0#lo,[sp,#4*(16+0)]
	vshl.i32	$t2#lo,$t0#lo,#2	@ four
	vstr		$t1#lo,[sp,#4*(16+2)]
	vmov		$a1,$a0
	vstr		$t2#lo,[sp,#4*(16+4)]
	vmov		$a2,$a0
	vmov		$b1,$b0
	vmov		$b2,$b0
	b		.Loop_neon_enter

.align	4
.Loop_neon_outer:
	ldmia		sp,{r0-r9}		@ load key material
	cmp		@t[3],#64*2		@ if len<=64*2
	bls		.Lbreak_neon		@ switch to integer-only
	vmov		$a1,$a0
	str		@t[3],[sp,#4*(32+2)]	@ save len
	vmov		$a2,$a0
	str		r12, [sp,#4*(32+1)]	@ save inp
	vmov		$b1,$b0
	str		r14, [sp,#4*(32+0)]	@ save out
	vmov		$b2,$b0
.Loop_neon_enter:
	ldr		@t[3], [sp,#4*(15)]
	vadd.i32	$d1,$d0,$t0		@ counter+1
	ldr		@x[12],[sp,#4*(12)]	@ modulo-scheduled load
	vmov		$c1,$c0
	ldr		@t[2], [sp,#4*(13)]
	vmov		$c2,$c0
	ldr		@x[14],[sp,#4*(14)]
	vadd.i32	$d2,$d1,$t0		@ counter+2
	str		@t[3], [sp,#4*(16+15)]
	mov		@t[3],#10
	add		@x[12],@x[12],#3	@ counter+3
	b		.Loop_neon

.align	4
.Loop_neon:
	subs		@t[3],@t[3],#1
___
	my @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,0);
	my @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,0);
	my @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,0);
	my @thread3=&ROUND(0,4,8,12);

	foreach (@thread0) {
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}

	@thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,1);
	@thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,1);
	@thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,1);
	@thread3=&ROUND(0,5,10,15);

	foreach (@thread0) {
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}
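
# Three NEON "threads" (one 64-byte block each) are interleaved with
# @thread3, a fourth block kept entirely in the scalar registers, so one
# trip through .Loop_neon advances four blocks at once. Issuing vector
# and integer instructions alternately keeps both pipelines busy; this
# is the "3xNEON+1xIALU" column of the performance table at the top of
# the file.
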
$code.=<<___;
	bne		.Loop_neon

	add		@t[3],sp,#32
	vld1.32		{$t0-$t1},[sp]		@ load key material
	vld1.32		{$t2-$t3},[@t[3]]

	ldr		@t[3],[sp,#4*(32+2)]	@ load len

	str		@t[0], [sp,#4*(16+8)]	@ modulo-scheduled store
	str		@t[1], [sp,#4*(16+9)]
	str		@x[12],[sp,#4*(16+12)]
	str		@t[2], [sp,#4*(16+13)]
	str		@x[14],[sp,#4*(16+14)]

	@ at this point we have first half of 512-bit result in
	@ @x[0-7] and second half at sp+4*(16+8)

	ldr		r12,[sp,#4*(32+1)]	@ load inp
	ldr		r14,[sp,#4*(32+0)]	@ load out

	vadd.i32	$a0,$a0,$t0		@ accumulate key material
	vadd.i32	$a1,$a1,$t0
	vadd.i32	$a2,$a2,$t0
	vldr		$t0#lo,[sp,#4*(16+0)]	@ one

	vadd.i32	$b0,$b0,$t1
	vadd.i32	$b1,$b1,$t1
	vadd.i32	$b2,$b2,$t1
	vldr		$t1#lo,[sp,#4*(16+2)]	@ two

	vadd.i32	$c0,$c0,$t2
	vadd.i32	$c1,$c1,$t2
	vadd.i32	$c2,$c2,$t2
	vadd.i32	$d1#lo,$d1#lo,$t0#lo	@ counter+1
	vadd.i32	$d2#lo,$d2#lo,$t1#lo	@ counter+2

	vadd.i32	$d0,$d0,$t3
	vadd.i32	$d1,$d1,$t3
	vadd.i32	$d2,$d2,$t3

	cmp		@t[3],#64*4
	blo		.Ltail_neon

	vld1.8		{$t0-$t1},[r12]!	@ load input
	mov		@t[3],sp
	vld1.8		{$t2-$t3},[r12]!
	veor		$a0,$a0,$t0		@ xor with input
	veor		$b0,$b0,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c0,$c0,$t2
	veor		$d0,$d0,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a1,$a1,$t0
	vst1.8		{$a0-$b0},[r14]!	@ store output
	veor		$b1,$b1,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c1,$c1,$t2
	vst1.8		{$c0-$d0},[r14]!
	veor		$d1,$d1,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a2,$a2,$t0
	vld1.32		{$a0-$b0},[@t[3]]!	@ load for next iteration
	veor		$t0#hi,$t0#hi,$t0#hi
	vldr		$t0#lo,[sp,#4*(16+4)]	@ four
	veor		$b2,$b2,$t1
	vld1.32		{$c0-$d0},[@t[3]]
	veor		$c2,$c2,$t2
	vst1.8		{$a1-$b1},[r14]!
	veor		$d2,$d2,$t3
	vst1.8		{$c1-$d1},[r14]!

	vadd.i32	$d0#lo,$d0#lo,$t0#lo	@ next counter value
	vldr		$t0#lo,[sp,#4*(16+0)]	@ one

	ldmia		sp,{@t[0]-@t[3]}	@ load key material
	add		@x[0],@x[0],@t[0]	@ accumulate key material
	ldr		@t[0],[r12],#16		@ load input
	vst1.8		{$a2-$b2},[r14]!
	add		@x[1],@x[1],@t[1]
	ldr		@t[1],[r12,#-12]
	vst1.8		{$c2-$d2},[r14]!
	add		@x[2],@x[2],@t[2]
	ldr		@t[2],[r12,#-8]
	add		@x[3],@x[3],@t[3]
	ldr		@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev		@x[0],@x[0]
	rev		@x[1],@x[1]
	rev		@x[2],@x[2]
	rev		@x[3],@x[3]
# endif
	eor		@x[0],@x[0],@t[0]	@ xor with input
	 add		@t[0],sp,#4*(4)
	eor		@x[1],@x[1],@t[1]
	str		@x[0],[r14],#16		@ store output
	eor		@x[2],@x[2],@t[2]
	str		@x[1],[r14,#-12]
	eor		@x[3],@x[3],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
	str		@x[2],[r14,#-8]
	str		@x[3],[r14,#-4]

	add		@x[4],@x[4],@t[0]	@ accumulate key material
	ldr		@t[0],[r12],#16		@ load input
	add		@x[5],@x[5],@t[1]
	ldr		@t[1],[r12,#-12]
	add		@x[6],@x[6],@t[2]
	ldr		@t[2],[r12,#-8]
	add		@x[7],@x[7],@t[3]
	ldr		@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev		@x[4],@x[4]
	rev		@x[5],@x[5]
	rev		@x[6],@x[6]
	rev		@x[7],@x[7]
# endif
	eor		@x[4],@x[4],@t[0]
	 add		@t[0],sp,#4*(8)
	eor		@x[5],@x[5],@t[1]
	str		@x[4],[r14],#16		@ store output
	eor		@x[6],@x[6],@t[2]
	str		@x[5],[r14,#-12]
	eor		@x[7],@x[7],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
	str		@x[6],[r14,#-8]
	 add		@x[0],sp,#4*(16+8)
	str		@x[7],[r14,#-4]

	ldmia		@x[0],{@x[0]-@x[7]}	@ load second half

	add		@x[0],@x[0],@t[0]	@ accumulate key material
	ldr		@t[0],[r12],#16		@ load input
	add		@x[1],@x[1],@t[1]
	ldr		@t[1],[r12,#-12]
# ifdef	__thumb2__
	it	hi
# endif
	 strhi		@t[2],[sp,#4*(16+10)]	@ copy "@x[10]" while at it
	add		@x[2],@x[2],@t[2]
	ldr		@t[2],[r12,#-8]
# ifdef	__thumb2__
	it	hi
# endif
	 strhi		@t[3],[sp,#4*(16+11)]	@ copy "@x[11]" while at it
	add		@x[3],@x[3],@t[3]
	ldr		@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev		@x[0],@x[0]
	rev		@x[1],@x[1]
	rev		@x[2],@x[2]
	rev		@x[3],@x[3]
# endif
	eor		@x[0],@x[0],@t[0]
	 add		@t[0],sp,#4*(12)
	eor		@x[1],@x[1],@t[1]
	str		@x[0],[r14],#16		@ store output
	eor		@x[2],@x[2],@t[2]
	str		@x[1],[r14,#-12]
	eor		@x[3],@x[3],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
	str		@x[2],[r14,#-8]
	str		@x[3],[r14,#-4]

	add		@x[4],@x[4],@t[0]	@ accumulate key material
	 add		@t[0],@t[0],#4		@ next counter value
	add		@x[5],@x[5],@t[1]
	 str		@t[0],[sp,#4*(12)]	@ save next counter value
	ldr		@t[0],[r12],#16		@ load input
	add		@x[6],@x[6],@t[2]
	 add		@x[4],@x[4],#3		@ counter+3
	ldr		@t[1],[r12,#-12]
	add		@x[7],@x[7],@t[3]
	ldr		@t[2],[r12,#-8]
	ldr		@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev		@x[4],@x[4]
	rev		@x[5],@x[5]
	rev		@x[6],@x[6]
	rev		@x[7],@x[7]
# endif
	eor		@x[4],@x[4],@t[0]
# ifdef	__thumb2__
	it	hi
# endif
	 ldrhi		@t[0],[sp,#4*(32+2)]	@ re-load len
	eor		@x[5],@x[5],@t[1]
	eor		@x[6],@x[6],@t[2]
	str		@x[4],[r14],#16		@ store output
	eor		@x[7],@x[7],@t[3]
	str		@x[5],[r14,#-12]
	 sub		@t[3],@t[0],#64*4	@ len-=64*4
	str		@x[6],[r14,#-8]
	str		@x[7],[r14,#-4]
	bhi		.Loop_neon_outer

	b		.Ldone_neon
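
@ .Lbreak_neon is entered from .Loop_neon_outer when no more than two
@ 64-byte blocks remain: a full NEON pass produces four blocks, so a
@ short tail is cheaper to finish in the integer-only loop.
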
.align	4
.Lbreak_neon:
	@ harmonize NEON and integer-only stack frames: load data
	@ from NEON frame, but save to integer-only one; distance
	@ between the two is 4*(32+4+16-32)=4*(20).

	str		@t[3], [sp,#4*(20+32+2)]	@ save len
	add		@t[3],sp,#4*(32+4)
	str		r12,   [sp,#4*(20+32+1)]	@ save inp
	str		r14,   [sp,#4*(20+32+0)]	@ save out

	ldr		@x[12],[sp,#4*(16+10)]
	ldr		@x[14],[sp,#4*(16+11)]
	vldmia		@t[3],{d8-d15}			@ fulfill ABI requirement
	str		@x[12],[sp,#4*(20+16+10)]	@ copy "@x[10]"
	str		@x[14],[sp,#4*(20+16+11)]	@ copy "@x[11]"

	ldr		@t[3], [sp,#4*(15)]
	ldr		@x[12],[sp,#4*(12)]		@ modulo-scheduled load
	ldr		@t[2], [sp,#4*(13)]
	ldr		@x[14],[sp,#4*(14)]
	str		@t[3], [sp,#4*(20+16+15)]
	add		@t[3],sp,#4*(20)
	vst1.32		{$a0-$b0},[@t[3]]!		@ copy key
	add		sp,sp,#4*(20)			@ switch frame
	vst1.32		{$c0-$d0},[@t[3]]
	mov		@t[3],#10
	b		.Loop				@ go integer-only

.align	4
.Ltail_neon:
	cmp		@t[3],#64*3
	bhs		.L192_or_more_neon
	cmp		@t[3],#64*2
	bhs		.L128_or_more_neon
	cmp		@t[3],#64*1
	bhs		.L64_or_more_neon

	add		@t[0],sp,#4*(8)
	vst1.8		{$a0-$b0},[sp]
	add		@t[2],sp,#4*(0)
	vst1.8		{$c0-$d0},[@t[0]]
	b		.Loop_tail_neon

.align	4
.L64_or_more_neon:
	vld1.8		{$t0-$t1},[r12]!
	vld1.8		{$t2-$t3},[r12]!
	veor		$a0,$a0,$t0
	veor		$b0,$b0,$t1
	veor		$c0,$c0,$t2
	veor		$d0,$d0,$t3
	vst1.8		{$a0-$b0},[r14]!
	vst1.8		{$c0-$d0},[r14]!

	beq		.Ldone_neon

	add		@t[0],sp,#4*(8)
	vst1.8		{$a1-$b1},[sp]
	add		@t[2],sp,#4*(0)
	vst1.8		{$c1-$d1},[@t[0]]
	sub		@t[3],@t[3],#64*1	@ len-=64*1
	b		.Loop_tail_neon

.align	4
.L128_or_more_neon:
	vld1.8		{$t0-$t1},[r12]!
	vld1.8		{$t2-$t3},[r12]!
	veor		$a0,$a0,$t0
	veor		$b0,$b0,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c0,$c0,$t2
	veor		$d0,$d0,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a1,$a1,$t0
	veor		$b1,$b1,$t1
	 vst1.8		{$a0-$b0},[r14]!
	veor		$c1,$c1,$t2
	 vst1.8		{$c0-$d0},[r14]!
	veor		$d1,$d1,$t3
	vst1.8		{$a1-$b1},[r14]!
	vst1.8		{$c1-$d1},[r14]!

	beq		.Ldone_neon

	add		@t[0],sp,#4*(8)
	vst1.8		{$a2-$b2},[sp]
	add		@t[2],sp,#4*(0)
	vst1.8		{$c2-$d2},[@t[0]]
	sub		@t[3],@t[3],#64*2	@ len-=64*2
	b		.Loop_tail_neon

.align	4
.L192_or_more_neon:
	vld1.8		{$t0-$t1},[r12]!
	vld1.8		{$t2-$t3},[r12]!
	veor		$a0,$a0,$t0
	veor		$b0,$b0,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c0,$c0,$t2
	veor		$d0,$d0,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a1,$a1,$t0
	veor		$b1,$b1,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c1,$c1,$t2
	 vst1.8		{$a0-$b0},[r14]!
	veor		$d1,$d1,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a2,$a2,$t0
	 vst1.8		{$c0-$d0},[r14]!
	veor		$b2,$b2,$t1
	 vst1.8		{$a1-$b1},[r14]!
	veor		$c2,$c2,$t2
	 vst1.8		{$c1-$d1},[r14]!
	veor		$d2,$d2,$t3
	vst1.8		{$a2-$b2},[r14]!
	vst1.8		{$c2-$d2},[r14]!

	beq		.Ldone_neon

	ldmia		sp,{@t[0]-@t[3]}	@ load key material
	add		@x[0],@x[0],@t[0]	@ accumulate key material
	 add		@t[0],sp,#4*(4)
	add		@x[1],@x[1],@t[1]
	add		@x[2],@x[2],@t[2]
	add		@x[3],@x[3],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material

	add		@x[4],@x[4],@t[0]	@ accumulate key material
	 add		@t[0],sp,#4*(8)
	add		@x[5],@x[5],@t[1]
	add		@x[6],@x[6],@t[2]
	add		@x[7],@x[7],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
# ifdef	__ARMEB__
	rev		@x[0],@x[0]
	rev		@x[1],@x[1]
	rev		@x[2],@x[2]
	rev		@x[3],@x[3]
	rev		@x[4],@x[4]
	rev		@x[5],@x[5]
	rev		@x[6],@x[6]
	rev		@x[7],@x[7]
# endif
	stmia		sp,{@x[0]-@x[7]}
	 add		@x[0],sp,#4*(16+8)

	ldmia		@x[0],{@x[0]-@x[7]}	@ load second half

	add		@x[0],@x[0],@t[0]	@ accumulate key material
	 add		@t[0],sp,#4*(12)
	add		@x[1],@x[1],@t[1]
	add		@x[2],@x[2],@t[2]
	add		@x[3],@x[3],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material

	add		@x[4],@x[4],@t[0]	@ accumulate key material
	 add		@t[0],sp,#4*(8)
	add		@x[5],@x[5],@t[1]
	 add		@x[4],@x[4],#3		@ counter+3
	add		@x[6],@x[6],@t[2]
	add		@x[7],@x[7],@t[3]
	 ldr		@t[3],[sp,#4*(32+2)]	@ re-load len
# ifdef	__ARMEB__
	rev		@x[0],@x[0]
	rev		@x[1],@x[1]
	rev		@x[2],@x[2]
	rev		@x[3],@x[3]
	rev		@x[4],@x[4]
	rev		@x[5],@x[5]
	rev		@x[6],@x[6]
	rev		@x[7],@x[7]
# endif
	stmia		@t[0],{@x[0]-@x[7]}
	 add		@t[2],sp,#4*(0)
	sub		@t[3],@t[3],#64*3	@ len-=64*3

.Loop_tail_neon:
	ldrb		@t[0],[@t[2]],#1	@ read buffer on stack
	ldrb		@t[1],[r12],#1		@ read input
	subs		@t[3],@t[3],#1
	eor		@t[0],@t[0],@t[1]
	strb		@t[0],[r14],#1		@ store output
	bne		.Loop_tail_neon

.Ldone_neon:
	add		sp,sp,#4*(32+4)
	vldmia		sp,{d8-d15}
	add		sp,sp,#4*(16+3)
	ldmia		sp!,{r4-r11,pc}
.size	ChaCha20_neon,.-ChaCha20_neon
#endif
___
}}}
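
# Post-process the accumulated code before printing: backticked
# expressions are evaluated, and the synthetic q<n>#lo/q<n>#hi names are
# rewritten as the corresponding 64-bit halves of the 128-bit NEON
# registers, e.g. q12#lo becomes d24 and q12#hi becomes d25.
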
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";