// Listing path: github.com/icodeface/tls@v0.0.0-20230910023335-34df9250cd12/internal/x/crypto/poly1305/sum_arm.s

// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build arm,!gccgo,!appengine,!nacl

#include "textflag.h"

// This code was translated into a form compatible with 5a from the public
// domain source by Andrew Moon: github.com/floodyberry/poly1305-opt/blob/master/app/extensions/poly1305.

// Five 32-bit masks (20 bytes). poly1305_init_ext_armv6 ANDs them, in order,
// with the five pieces it extracts from the first 16 bytes of the key.
// NOTE(review): the GLOBL flag value 8 should be RODATA in textflag.h — confirm.
DATA ·poly1305_init_constants_armv6<>+0x00(SB)/4, $0x3ffffff
DATA ·poly1305_init_constants_armv6<>+0x04(SB)/4, $0x3ffff03
DATA ·poly1305_init_constants_armv6<>+0x08(SB)/4, $0x3ffc0ff
DATA ·poly1305_init_constants_armv6<>+0x0c(SB)/4, $0x3f03fff
DATA ·poly1305_init_constants_armv6<>+0x10(SB)/4, $0x00fffff
GLOBL ·poly1305_init_constants_armv6<>(SB), 8, $20

// Warning: the linker may use R11 to synthesize certain instructions. Please
// take care and verify that no synthetic instructions use it.

// poly1305_init_ext_armv6 fills in the context structure from the key.
//
// In:  R0 = pointer to the 64-byte context, R1 = pointer to the 32-byte key.
// Context layout written here (byte offsets):
//    0..19  five masked pieces of key bytes 0..15
//   20..39  five zero words (read and written back by poly1305_blocks_armv6)
//   40..55  key bytes 16..31, copied verbatim
//   56..59  zero (flag word tested by poly1305_blocks_armv6)
// Clobbers R0-R3, R8, R9, g, R11, R12; R4-R7 are saved and restored.
TEXT poly1305_init_ext_armv6<>(SB), NOSPLIT, $0
	// Needs 16 bytes of stack and 64 bytes of space pointed to by R0. (It
	// might look like it's only 60 bytes of space but the final four bytes
	// will be written by another function.) We need to skip over four
	// bytes of stack because that's saving the value of 'g'.
	ADD $4, R13, R8
	MOVM.IB [R4-R7], (R8) // save R4-R7 at 8(R13)..20(R13)
	MOVM.IA.W (R1), [R2-R5] // key bytes 0..15; R1 advances by 16
	MOVW $·poly1305_init_constants_armv6<>(SB), R7

	// Split the 128-bit value R5:R4:R3:R2 at bit offsets 0, 26, 52, 78 and 104.
	// The upper bits of each piece are removed by the masks below.
	MOVW R2, R8
	MOVW R2>>26, R9
	MOVW R3>>20, g
	MOVW R4>>14, R11
	MOVW R5>>8, R12
	ORR R3<<6, R9, R9
	ORR R4<<12, g, g
	ORR R5<<18, R11, R11

	// Apply the five masks from the constant table.
	MOVM.IA (R7), [R2-R6]
	AND R8, R2, R2
	AND R9, R3, R3
	AND g, R4, R4
	AND R11, R5, R5
	AND R12, R6, R6
	MOVM.IA.W [R2-R6], (R0) // context offsets 0..19

	// Zero the next five context words.
	EOR R2, R2, R2
	EOR R3, R3, R3
	EOR R4, R4, R4
	EOR R5, R5, R5
	EOR R6, R6, R6
	MOVM.IA.W [R2-R6], (R0) // context offsets 20..39

	// Copy key bytes 16..31; R6 is still zero, so offset 56 is cleared too.
	MOVM.IA.W (R1), [R2-R5]
	MOVM.IA [R2-R6], (R0) // context offsets 40..59

	// Restore R4-R7 from 8(R13)..20(R13).
	ADD $20, R13, R0
	MOVM.DA (R0), [R4-R7]
	RET

// MOVW_UNALIGNED copies the four bytes at offset(Rsrc) to offset(Rdst) one
// byte at a time, using Rtmp as scratch. Neither pointer is modified.
#define MOVW_UNALIGNED(Rsrc, Rdst, Rtmp, offset) \
	MOVBU (offset+0)(Rsrc), Rtmp; \
	MOVBU Rtmp, (offset+0)(Rdst); \
	MOVBU (offset+1)(Rsrc), Rtmp; \
	MOVBU Rtmp, (offset+1)(Rdst); \
	MOVBU (offset+2)(Rsrc), Rtmp; \
	MOVBU Rtmp, (offset+2)(Rdst); \
	MOVBU (offset+3)(Rsrc), Rtmp; \
	MOVBU Rtmp, (offset+3)(Rdst)

// poly1305_blocks_armv6 absorbs 16-byte blocks into the context.
//
// In:  R0 = context pointer, R1 = input pointer, R2 = input length in bytes.
// Blocks are consumed while at least 16 bytes remain; a trailing remainder
// shorter than 16 bytes is ignored. The five words at context offsets
// 20..39 are loaded on entry and stored back on exit.
//
// Stack slots used, relative to R13:
//    28..51   saved R4-R8 and R14
//    52..83   four 64-bit intermediate products
//    84       per-block high-bit word (1<<24, or 0 when context word 56 != 0)
//    88/92/96 saved context pointer / current input pointer / bytes remaining
//   100..115  bounce buffer for an unaligned input block
//   116..135  copy of context words 0..19
TEXT poly1305_blocks_armv6<>(SB), NOSPLIT, $0
	// Needs 24 bytes of stack for saved registers and then 88 bytes of
	// scratch space after that. We assume that 24 bytes at (R13) have
	// already been used: four bytes for the link register saved in the
	// prelude of poly1305_auth_armv6, four bytes for saving the value of g
	// in that function and 16 bytes of scratch space used around
	// poly1305_finish_ext_armv6_skip1.
	ADD $24, R13, R12
	MOVM.IB [R4-R8, R14], (R12)
	MOVW R0, 88(R13)
	MOVW R1, 92(R13)
	MOVW R2, 96(R13)
	MOVW R1, R14 // R14 = input pointer
	MOVW R2, R12 // R12 = bytes remaining

	// R6 = 1<<24 when context word 56 is zero, otherwise 0. It is ORed into
	// the top piece of every block below.
	MOVW 56(R0), R8
	WORD $0xe1180008 // TST R8, R8 not working see issue 5921
	EOR R6, R6, R6
	MOVW.EQ $(1<<24), R6
	MOVW R6, 84(R13)

	// R0-R4 = context words 0..19 (kept at 116(R13)), R5-R9 = words 20..39.
	ADD $116, R13, g
	MOVM.IA (R0), [R0-R9]
	MOVM.IA [R0-R4], (g)
	CMP $16, R12
	BLO poly1305_blocks_armv6_done

poly1305_blocks_armv6_mainloop:
	WORD $0xe31e0003 // TST R14, #3 not working see issue 5921
	BEQ poly1305_blocks_armv6_mainloop_aligned

	// Unaligned input: copy 16 bytes to the bounce buffer, then load from it.
	ADD $100, R13, g
	MOVW_UNALIGNED(R14, g, R0, 0)
	MOVW_UNALIGNED(R14, g, R0, 4)
	MOVW_UNALIGNED(R14, g, R0, 8)
	MOVW_UNALIGNED(R14, g, R0, 12)
	MOVM.IA (g), [R0-R3]
	ADD $16, R14
	B poly1305_blocks_armv6_mainloop_loaded

poly1305_blocks_armv6_mainloop_aligned:
	MOVM.IA.W (R14), [R0-R3]

poly1305_blocks_armv6_mainloop_loaded:
	// Split the block R3:R2:R1:R0 into five pieces at bit offsets 0, 26, 52,
	// 78 and 104 (the first four masked to 26 bits), OR the word from
	// 84(R13) into the top piece, and add the pieces to R5-R9.
	MOVW R0>>26, g
	MOVW R1>>20, R11
	MOVW R2>>14, R12
	MOVW R14, 92(R13) // save the advanced input pointer; R14 is reused below
	MOVW R3>>8, R4
	ORR R1<<6, g, g
	ORR R2<<12, R11, R11
	ORR R3<<18, R12, R12
	BIC $0xfc000000, R0, R0
	BIC $0xfc000000, g, g
	MOVW 84(R13), R3
	BIC $0xfc000000, R11, R11
	BIC $0xfc000000, R12, R12
	ADD R0, R5, R5
	ADD g, R6, R6
	ORR R3, R4, R4
	ADD R11, R7, R7
	ADD $116, R13, R14
	ADD R12, R8, R8
	ADD R4, R9, R9

	// R0-R4 = saved context words 0..19. Accumulate 64-bit products in the
	// register pairs (R11 high, g low) and (R14 high, R12 low).
	// NOTE(review): multiplying R1-R4 by 5 before reuse looks like the usual
	// Poly1305 reduction modulo 2^130 - 5 — confirm against the reference.
	MOVM.IA (R14), [R0-R4]
	MULLU R4, R5, (R11, g)
	MULLU R3, R5, (R14, R12)
	MULALU R3, R6, (R11, g)
	MULALU R2, R6, (R14, R12)
	MULALU R2, R7, (R11, g)
	MULALU R1, R7, (R14, R12)
	ADD R4<<2, R4, R4 // R4 = 5*R4
	ADD R3<<2, R3, R3 // R3 = 5*R3
	MULALU R1, R8, (R11, g)
	MULALU R0, R8, (R14, R12)
	MULALU R0, R9, (R11, g)
	MULALU R4, R9, (R14, R12)
	MOVW g, 76(R13)
	MOVW R11, 80(R13)
	MOVW R12, 68(R13)
	MOVW R14, 72(R13)
	MULLU R2, R5, (R11, g)
	MULLU R1, R5, (R14, R12)
	MULALU R1, R6, (R11, g)
	MULALU R0, R6, (R14, R12)
	MULALU R0, R7, (R11, g)
	MULALU R4, R7, (R14, R12)
	ADD R2<<2, R2, R2 // R2 = 5*R2
	ADD R1<<2, R1, R1 // R1 = 5*R1
	MULALU R4, R8, (R11, g)
	MULALU R3, R8, (R14, R12)
	MULALU R3, R9, (R11, g)
	MULALU R2, R9, (R14, R12)
	MOVW g, 60(R13)
	MOVW R11, 64(R13)
	MOVW R12, 52(R13)
	MOVW R14, 56(R13)
	MULLU R0, R5, (R11, g)
	MULALU R4, R6, (R11, g)
	MULALU R3, R7, (R11, g)
	MULALU R2, R8, (R11, g)
	MULALU R1, R9, (R11, g)

	// Reload the four saved products into R0-R7 (low word first in each
	// pair) and propagate carries in 26-bit steps. The carry out of the
	// 76(R13) product is multiplied by 5 and added to the lowest piece.
	ADD $52, R13, R0
	MOVM.IA (R0), [R0-R7]
	MOVW g>>26, R12
	MOVW R4>>26, R14
	ORR R11<<6, R12, R12
	ORR R5<<6, R14, R14
	BIC $0xfc000000, g, g
	BIC $0xfc000000, R4, R4
	ADD.S R12, R0, R0
	ADC $0, R1, R1
	ADD.S R14, R6, R6
	ADC $0, R7, R7
	MOVW R0>>26, R12
	MOVW R6>>26, R14
	ORR R1<<6, R12, R12
	ORR R7<<6, R14, R14
	BIC $0xfc000000, R0, R0
	BIC $0xfc000000, R6, R6
	ADD R14<<2, R14, R14 // R14 = 5*R14
	ADD.S R12, R2, R2
	ADC $0, R3, R3
	ADD R14, g, g
	MOVW R2>>26, R12
	MOVW g>>26, R14
	ORR R3<<6, R12, R12
	BIC $0xfc000000, g, R5
	BIC $0xfc000000, R2, R7
	ADD R12, R4, R4
	ADD R14, R0, R0
	MOVW R4>>26, R12
	BIC $0xfc000000, R4, R8
	ADD R12, R6, R9

	// Loop control: the flags come from the CMP against the length before it
	// is decremented, so the loop repeats while at least 16 bytes remain
	// after this block.
	MOVW 96(R13), R12
	MOVW 92(R13), R14
	MOVW R0, R6
	CMP $32, R12
	SUB $16, R12, R12
	MOVW R12, 96(R13)
	BHS poly1305_blocks_armv6_mainloop

poly1305_blocks_armv6_done:
	// Write R5-R9 back to context offsets 20..39 and restore saved registers.
	MOVW 88(R13), R12
	MOVW R5, 20(R12)
	MOVW R6, 24(R12)
	MOVW R7, 28(R12)
	MOVW R8, 32(R12)
	MOVW R9, 36(R12)
	ADD $48, R13, R0
	MOVM.DA (R0), [R4-R8, R14]
	RET

// MOVHUP_UNALIGNED copies two bytes from (Rsrc) to (Rdst) one byte at a time
// through Rtmp, post-incrementing both pointers by one after each byte.
#define MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp) \
	MOVBU.P 1(Rsrc), Rtmp; \
	MOVBU.P Rtmp, 1(Rdst); \
	MOVBU.P 1(Rsrc), Rtmp; \
	MOVBU.P Rtmp, 1(Rdst)

// MOVWP_UNALIGNED copies four bytes the same way, advancing both pointers by
// four in total.
#define MOVWP_UNALIGNED(Rsrc, Rdst, Rtmp) \
	MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp); \
	MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp)

// func poly1305_auth_armv6(out *[16]byte, m *byte, mlen uint32, key *[32]byte)
//
// Writes 16 bytes to out, computed from the mlen bytes at m and the 32-byte
// key. The 64-byte context on this function's stack is zeroed before return.
TEXT ·poly1305_auth_armv6(SB), $196-16
	// The value 196, just above, is the sum of 64 (the size of the context
	// structure) and 132 (the amount of stack needed).
	//
	// At this point, the stack pointer (R13) has been moved down. It
	// points to the saved link register and there's 196 bytes of free
	// space above it.
	//
	// The stack for this function looks like:
	//
	// +---------------------
	// |
	// | 64 bytes of context structure
	// |
	// +---------------------
	// |
	// | 112 bytes for poly1305_blocks_armv6
	// |
	// +---------------------
	// | 16 bytes of final block, constructed at
	// | poly1305_finish_ext_armv6_skip8
	// +---------------------
	// | four bytes of saved 'g'
	// +---------------------
	// | lr, saved by prelude    <- R13 points here
	// +---------------------
	MOVW g, 4(R13) // g is used as a scratch register throughout; restored before RET

	MOVW out+0(FP), R4
	MOVW m+4(FP), R5
	MOVW mlen+8(FP), R6
	MOVW key+12(FP), R7

	ADD $136, R13, R0 // 136 = 4 + 4 + 16 + 112
	MOVW R7, R1

	// poly1305_init_ext_armv6 will write to the stack from R13+4, but
	// that's ok because none of the other values have been written yet.
	BL poly1305_init_ext_armv6<>(SB)

	// R2 = mlen rounded down to a multiple of 16; skip the call when it is 0.
	BIC.S $15, R6, R2
	BEQ poly1305_auth_armv6_noblocks
	ADD $136, R13, R0
	MOVW R5, R1
	ADD R2, R5, R5 // R5 = pointer to the remaining tail bytes
	SUB R2, R6, R6 // R6 = number of tail bytes (0..15)
	BL poly1305_blocks_armv6<>(SB)

poly1305_auth_armv6_noblocks:
	// R0/R5 = context, R1/R6 = tail pointer, R2/R7 = tail length, R3/R8 = out.
	ADD $136, R13, R0
	MOVW R5, R1
	MOVW R6, R2
	MOVW R4, R3

	MOVW R0, R5
	MOVW R1, R6
	MOVW R2, R7
	MOVW R3, R8
	AND.S R2, R2, R2 // set flags on the tail length
	BEQ poly1305_finish_ext_armv6_noremaining

	// Zero the 16-byte scratch block, then copy the tail into it. The bits of
	// the tail length (8, 4, 2, 1) select which copy steps run.
	EOR R0, R0
	ADD $8, R13, R9 // 8 = offset to 16 byte scratch space
	MOVW R0, (R9)
	MOVW R0, 4(R9)
	MOVW R0, 8(R9)
	MOVW R0, 12(R9)
	WORD $0xe3110003 // TST R1, #3 not working see issue 5921
	BEQ poly1305_finish_ext_armv6_aligned
	WORD $0xe3120008 // TST R2, #8 not working see issue 5921
	BEQ poly1305_finish_ext_armv6_skip8
	MOVWP_UNALIGNED(R1, R9, g)
	MOVWP_UNALIGNED(R1, R9, g)

poly1305_finish_ext_armv6_skip8:
	WORD $0xe3120004 // TST $4, R2 not working see issue 5921
	BEQ poly1305_finish_ext_armv6_skip4
	MOVWP_UNALIGNED(R1, R9, g)

poly1305_finish_ext_armv6_skip4:
	WORD $0xe3120002 // TST $2, R2 not working see issue 5921
	BEQ poly1305_finish_ext_armv6_skip2
	MOVHUP_UNALIGNED(R1, R9, g)
	B poly1305_finish_ext_armv6_skip2

poly1305_finish_ext_armv6_aligned:
	WORD $0xe3120008 // TST R2, #8 not working see issue 5921
	BEQ poly1305_finish_ext_armv6_skip8_aligned
	MOVM.IA.W (R1), [g-R11] // two words (8 bytes) via g and R11
	MOVM.IA.W [g-R11], (R9)

poly1305_finish_ext_armv6_skip8_aligned:
	WORD $0xe3120004 // TST $4, R2 not working see issue 5921
	BEQ poly1305_finish_ext_armv6_skip4_aligned
	MOVW.P 4(R1), g
	MOVW.P g, 4(R9)

poly1305_finish_ext_armv6_skip4_aligned:
	WORD $0xe3120002 // TST $2, R2 not working see issue 5921
	BEQ poly1305_finish_ext_armv6_skip2
	MOVHU.P 2(R1), g
	MOVH.P g, 2(R9)

poly1305_finish_ext_armv6_skip2:
	WORD $0xe3120001 // TST $1, R2 not working see issue 5921
	BEQ poly1305_finish_ext_armv6_skip1
	MOVBU.P 1(R1), g
	MOVBU.P g, 1(R9)

poly1305_finish_ext_armv6_skip1:
	// R9 now points just past the copied tail: append a single 1 byte, set
	// context word 56 to 1 (poly1305_blocks_armv6 then omits its 1<<24 term),
	// and process the scratch block as one 16-byte block.
	MOVW $1, R11
	MOVBU R11, 0(R9)
	MOVW R11, 56(R5)
	MOVW R5, R0
	ADD $8, R13, R1
	MOVW $16, R2
	BL poly1305_blocks_armv6<>(SB)

poly1305_finish_ext_armv6_noremaining:
	// Load context words 20..39 and carry them fully into 26-bit pieces
	// R0-R4. The carry out of the top piece is multiplied by 5 and added to
	// the bottom piece.
	MOVW 20(R5), R0
	MOVW 24(R5), R1
	MOVW 28(R5), R2
	MOVW 32(R5), R3
	MOVW 36(R5), R4
	MOVW R4>>26, R12
	BIC $0xfc000000, R4, R4
	ADD R12<<2, R12, R12
	ADD R12, R0, R0
	MOVW R0>>26, R12
	BIC $0xfc000000, R0, R0
	ADD R12, R1, R1
	MOVW R1>>26, R12
	BIC $0xfc000000, R1, R1
	ADD R12, R2, R2
	MOVW R2>>26, R12
	BIC $0xfc000000, R2, R2
	ADD R12, R3, R3
	MOVW R3>>26, R12
	BIC $0xfc000000, R3, R3
	ADD R12, R4, R4

	// Candidate value in R6, R7, g, R11, R9: the pieces plus 5, with 1<<26
	// subtracted from the top piece.
	ADD $5, R0, R6
	MOVW R6>>26, R12
	BIC $0xfc000000, R6, R6
	ADD R12, R1, R7
	MOVW R7>>26, R12
	BIC $0xfc000000, R7, R7
	ADD R12, R2, g
	MOVW g>>26, R12
	BIC $0xfc000000, g, g
	ADD R12, R3, R11
	MOVW $-(1<<26), R12
	ADD R11>>26, R12, R12
	BIC $0xfc000000, R11, R11
	ADD R12, R4, R9

	// Branch-free select: R12 = all ones when bit 31 of R9 is clear (keep
	// the candidate), zero when it is set (keep R0-R4).
	MOVW R9>>31, R12
	SUB $1, R12
	AND R12, R6, R6
	AND R12, R7, R7
	AND R12, g, g
	AND R12, R11, R11
	AND R12, R9, R9
	MVN R12, R12
	AND R12, R0, R0
	AND R12, R1, R1
	AND R12, R2, R2
	AND R12, R3, R3
	AND R12, R4, R4
	ORR R6, R0, R0
	ORR R7, R1, R1
	ORR g, R2, R2
	ORR R11, R3, R3
	ORR R9, R4, R4

	// Repack the five 26-bit pieces into four 32-bit words R0-R3.
	ORR R1<<26, R0, R0
	MOVW R1>>6, R1
	ORR R2<<20, R1, R1
	MOVW R2>>12, R2
	ORR R3<<14, R2, R2
	MOVW R3>>18, R3
	ORR R4<<8, R3, R3

	// Add context words 40..55 (key bytes 16..31) with carry propagation and
	// store the 16-byte result to out (R8).
	MOVW 40(R5), R6
	MOVW 44(R5), R7
	MOVW 48(R5), g
	MOVW 52(R5), R11
	ADD.S R6, R0, R0
	ADC.S R7, R1, R1
	ADC.S g, R2, R2
	ADC.S R11, R3, R3
	MOVM.IA [R0-R3], (R8)

	// Zero all 64 bytes of the context (two stores of eight zero words).
	MOVW R5, R12
	EOR R0, R0, R0
	EOR R1, R1, R1
	EOR R2, R2, R2
	EOR R3, R3, R3
	EOR R4, R4, R4
	EOR R5, R5, R5
	EOR R6, R6, R6
	EOR R7, R7, R7
	MOVM.IA.W [R0-R7], (R12)
	MOVM.IA [R0-R7], (R12)
	MOVW 4(R13), g // restore g saved at function entry
	RET