// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !purego

#include "textflag.h"

// Fixed-size entry points. Each one loads the operand length, counted in
// 64-bit words, into R0 and tail-jumps (JMP, not a call) to the shared
// implementation addMulVVWy below. No registers other than R0 are touched and
// no frame is allocated ($0), so addMulVVWy reads the arguments and writes the
// result through the same FP-relative slots (32 bytes of args+result).

// func addMulVVW256(z, x *uint, y uint) (c uint)
// 4 words * 64 bits = 256 bits.
TEXT ·addMulVVW256(SB), $0-32
	MOVD	$4, R0
	JMP	addMulVVWy(SB)

// func addMulVVW1024(z, x *uint, y uint) (c uint)
// 16 words * 64 bits = 1024 bits.
TEXT ·addMulVVW1024(SB), $0-32
	MOVD	$16, R0
	JMP	addMulVVWy(SB)

// func addMulVVW1536(z, x *uint, y uint) (c uint)
// 24 words * 64 bits = 1536 bits.
TEXT ·addMulVVW1536(SB), $0-32
	MOVD	$24, R0
	JMP	addMulVVWy(SB)

// func addMulVVW2048(z, x *uint, y uint) (c uint)
// 32 words * 64 bits = 2048 bits.
TEXT ·addMulVVW2048(SB), $0-32
	MOVD	$32, R0
	JMP	addMulVVWy(SB)

// addMulVVWy computes z += x * y over R0 words and returns the final carry
// word in c. It is only reached by JMP from the wrappers above.
//
// Register use:
//	R0        remaining word count; must be a multiple of 4, because the loop
//	          consumes 4 words per iteration and only exits when R0 is
//	          exactly zero (CBZ)
//	R1        cursor into z (read, then written back in place)
//	R2        cursor into x
//	R3        the multiplier y
//	R4        carry word between iterations, and the returned c
//	R5-R8     the current 4 words of x
//	R9-R12    the current 4 words of z
//	R13-R21   partial products (low and high halves)
TEXT addMulVVWy(SB), NOFRAME|NOSPLIT, $0
	MOVD	z+0(FP), R1
	MOVD	x+8(FP), R2
	MOVD	y+16(FP), R3
	MOVD	$0, R4			// no carry into the first block

// The main loop of this code operates on a block of 4 words every iteration
// performing [R4:R12:R11:R10:R9] = R4 + R3 * [R8:R7:R6:R5] + [R12:R11:R10:R9]
// where R4 is carried from the previous iteration, R8:R7:R6:R5 hold the next
// 4 words of x, R3 is y and R12:R11:R10:R9 are part of the result z.
//
// The sum is built with two carry chains. The loads, stores and multiplies
// interleaved between the ADDS/ADCS/ADC instructions are there for scheduling
// only; the order of the flag-setting adds within each chain must be kept.
loop:
	CBZ	R0, done

	// Load x[0..3], post-incrementing the x cursor by 32 bytes in total.
	LDP.P	16(R2), (R5, R6)
	LDP.P	16(R2), (R7, R8)

	// Chain 1: add the incoming carry and the low halves of x[1..3]*y.
	// The high half of x[3]*y plus this chain's carry-out is kept in R20.
	LDP	(R1), (R9, R10)
	ADDS	R4, R9			// z0 += carry-in
	MUL	R6, R3, R14
	ADCS	R14, R10		// z1 += lo(x1*y)
	MUL	R7, R3, R15
	LDP	16(R1), (R11, R12)
	ADCS	R15, R11		// z2 += lo(x2*y)
	MUL	R8, R3, R16
	ADCS	R16, R12		// z3 += lo(x3*y)
	UMULH	R8, R3, R20
	ADC	$0, R20			// R20 = hi(x3*y) + carry of chain 1

	// Chain 2: add the low half of x[0]*y and the high halves of x[0..2]*y,
	// each high half landing one word above its low half. Finished words are
	// stored back to z as soon as they are final, advancing the z cursor.
	MUL	R5, R3, R13
	ADDS	R13, R9			// z0 += lo(x0*y)
	UMULH	R5, R3, R17
	ADCS	R17, R10		// z1 += hi(x0*y)
	UMULH	R6, R3, R21
	STP.P	(R9, R10), 16(R1)
	ADCS	R21, R11		// z2 += hi(x1*y)
	UMULH	R7, R3, R19
	ADCS	R19, R12		// z3 += hi(x2*y)
	STP.P	(R11, R12), 16(R1)
	ADC	$0, R20, R4		// carry-out = R20 + carry of chain 2

	SUB	$4, R0
	B	loop

done:
	MOVD	R4, c+24(FP)
	RET