// Source: github.com/ethereum/go-ethereum@v1.16.1/crypto/bn256/cloudflare/mul_arm64.h
//
// Go assembler (ARM64) macros. Operand order is Go's: the destination is the
// last operand, and two-operand ADDS/ADCS accumulate into it.

// mul multiplies two 256-bit numbers in little-endian order.
// The inputs are (R1,R2,R3,R4) times (R5,R6,R7,R8)
// and the product is stored in (c0,c1,c2,c3,c4,c5,c6,c7).
// Note that the input registers (R1,R2,R3) are overwritten.
//
// R0 and R26 are used as scratch and the condition flags are modified.
// R4 and R5..R8 are only read. c5, c6 and c7 double as temporaries while the
// earlier rows are computed, so the output registers must be distinct from
// R0..R8 and R26.
#define mul(c0,c1,c2,c3,c4,c5,c6,c7) \
	\ // Row 0: (c0..c4) = R1 * (R5,R6,R7,R8)
	MUL R1, R5, c0 \
	UMULH R1, R5, c1 \
	MUL R1, R6, R0 \
	ADDS R0, c1 \
	UMULH R1, R6, c2 \
	MUL R1, R7, R0 \
	ADCS R0, c2 \
	UMULH R1, R7, c3 \
	MUL R1, R8, R0 \
	ADCS R0, c3 \
	UMULH R1, R8, c4 \
	ADCS ZR, c4 \
	\
	\ // Row 1: R2 * (R5,R6,R7,R8) is built in (R1,R26,c6,c7,c5), then added into c1..c5
	MUL R2, R5, R1 \
	UMULH R2, R5, R26 \
	MUL R2, R6, R0 \
	ADDS R0, R26 \
	UMULH R2, R6, c6 \
	MUL R2, R7, R0 \
	ADCS R0, c6 \
	UMULH R2, R7, c7 \
	MUL R2, R8, R0 \
	ADCS R0, c7 \
	UMULH R2, R8, c5 \
	ADCS ZR, c5 \
	ADDS R1, c1 \
	ADCS R26, c2 \
	ADCS c6, c3 \
	ADCS c7, c4 \
	ADCS ZR, c5 \
	\
	\ // Row 2: R3 * (R5,R6,R7,R8) is built in (R1,R26,R2,c7,c6), then added into c2..c6
	MUL R3, R5, R1 \
	UMULH R3, R5, R26 \
	MUL R3, R6, R0 \
	ADDS R0, R26 \
	UMULH R3, R6, R2 \
	MUL R3, R7, R0 \
	ADCS R0, R2 \
	UMULH R3, R7, c7 \
	MUL R3, R8, R0 \
	ADCS R0, c7 \
	UMULH R3, R8, c6 \
	ADCS ZR, c6 \
	ADDS R1, c2 \
	ADCS R26, c3 \
	ADCS R2, c4 \
	ADCS c7, c5 \
	ADCS ZR, c6 \
	\
	\ // Row 3: R4 * (R5,R6,R7,R8) is built in (R1,R26,R2,R3,c7), then added into c3..c7
	MUL R4, R5, R1 \
	UMULH R4, R5, R26 \
	MUL R4, R6, R0 \
	ADDS R0, R26 \
	UMULH R4, R6, R2 \
	MUL R4, R7, R0 \
	ADCS R0, R2 \
	UMULH R4, R7, R3 \
	MUL R4, R8, R0 \
	ADCS R0, R3 \
	UMULH R4, R8, c7 \
	ADCS ZR, c7 \
	ADDS R1, c3 \
	ADCS R26, c4 \
	ADCS R2, c5 \
	ADCS R3, c6 \
	ADCS ZR, c7

// gfpReduce reduces the 512-bit value held little-endian in
// (R9,R10,R11,R12,R13,R14,R15,R16) and leaves the 256-bit result in
// (R1,R2,R3,R4).
//
// Steps, following the inline comments:
//   1. m = low 256 bits of (R9..R12) * np, where np is read from the ·np symbol.
//   2. m * N via mul, with the modulus placed in (R5,R6,R7,R8) by loadModulus.
//   3. Add the 512-bit input to m*N; the upper half ends up in (R21..R24).
//   4. Subtract (R5..R8) from that upper half and keep the difference only if
//      the subtraction did not borrow.
//
// loadModulus and ·np are defined outside this header; that loadModulus fills
// R5..R8 with the field modulus is assumed from its name and from how R5..R8
// are used below (TODO confirm against its definition).
//
// Registers written here or by the nested mul: R0..R4, R10..R13, R17,
// R19..R26, plus whatever loadModulus writes. Condition flags are modified.
#define gfpReduce() \
	\ // m = (T * N') mod R, store m in R1:R2:R3:R4
	MOVD ·np+0(SB), R17 \
	MOVD ·np+8(SB), R25 \
	MOVD ·np+16(SB), R19 \
	MOVD ·np+24(SB), R20 \
	\
	\ // R9 * np, low four words only (the carry out of R4 is dropped)
	MUL R9, R17, R1 \
	UMULH R9, R17, R2 \
	MUL R9, R25, R0 \
	ADDS R0, R2 \
	UMULH R9, R25, R3 \
	MUL R9, R19, R0 \
	ADCS R0, R3 \
	UMULH R9, R19, R4 \
	MUL R9, R20, R0 \
	ADCS R0, R4 \
	\
	\ // R10 * np, low three words, added into R2:R3:R4
	MUL R10, R17, R21 \
	UMULH R10, R17, R22 \
	MUL R10, R25, R0 \
	ADDS R0, R22 \
	UMULH R10, R25, R23 \
	MUL R10, R19, R0 \
	ADCS R0, R23 \
	ADDS R21, R2 \
	ADCS R22, R3 \
	ADCS R23, R4 \
	\
	\ // R11 * np, low two words, added into R3:R4
	MUL R11, R17, R21 \
	UMULH R11, R17, R22 \
	MUL R11, R25, R0 \
	ADDS R0, R22 \
	ADDS R21, R3 \
	ADCS R22, R4 \
	\
	\ // R12 * np, low word, added into R4
	MUL R12, R17, R21 \
	ADDS R21, R4 \
	\
	\ // m * N
	loadModulus(R5,R6,R7,R8) \
	mul(R17,R25,R19,R20,R21,R22,R23,R24) \
	\
	\ // Add the 512-bit intermediate to m*N
	\ // R0 receives the final carry; no instruction below reads it.
	MOVD ZR, R0 \
	ADDS R9, R17 \
	ADCS R10, R25 \
	ADCS R11, R19 \
	ADCS R12, R20 \
	ADCS R13, R21 \
	ADCS R14, R22 \
	ADCS R15, R23 \
	ADCS R16, R24 \
	ADCS ZR, R0 \
	\
	\ // Our output is R21:R22:R23:R24. Reduce mod p if necessary.
	SUBS R5, R21, R10 \
	SBCS R6, R22, R11 \
	SBCS R7, R23, R12 \
	SBCS R8, R24, R13 \
	\
	\ // Carry set means no borrow: take the difference, else keep the sum.
	CSEL CS, R10, R21, R1 \
	CSEL CS, R11, R22, R2 \
	CSEL CS, R12, R23, R3 \
	CSEL CS, R13, R24, R4