github.com/ethereum/go-ethereum@v1.16.1/crypto/bn256/cloudflare/mul_arm64.h (about)

     1  // mul multiplies two 256-bit numbers in little-endian order (four 64-bit limbs each): (R1,R2,R3,R4) times (R5,R6,R7,R8).
     2  // The 512-bit product is stored in (c0,c1,c2,c3,c4,c5,c6,c7), c0 being the lowest limb; each block below handles one limb of the first input.
     3  // Note that the input registers (R1,R2,R3) are overwritten; R0 and R26 are scratch and the condition flags are clobbered.
     4  // c6 and c7 double as temporaries before they receive their final values, so the c registers must not alias R0-R8 or R26.
     5  #define mul(c0,c1,c2,c3,c4,c5,c6,c7) \
     6  	MUL R1, R5, c0 \
     7  	UMULH R1, R5, c1 \
     8  	MUL R1, R6, R0 \
     9  	ADDS R0, c1 \
    10  	UMULH R1, R6, c2 \
    11  	MUL R1, R7, R0 \
    12  	ADCS R0, c2 \
    13  	UMULH R1, R7, c3 \
    14  	MUL R1, R8, R0 \
    15  	ADCS R0, c3 \
    16  	UMULH R1, R8, c4 \
    17  	ADCS ZR, c4 \
    18  	\ // R2 * (R5..R8) -> (R1,R26,c6,c7,c5), with c6/c7 as scratch; then added into c1..c5
    19  	MUL R2, R5, R1 \
    20  	UMULH R2, R5, R26 \
    21  	MUL R2, R6, R0 \
    22  	ADDS R0, R26 \
    23  	UMULH R2, R6, c6 \
    24  	MUL R2, R7, R0 \
    25  	ADCS R0, c6 \
    26  	UMULH R2, R7, c7 \
    27  	MUL R2, R8, R0 \
    28  	ADCS R0, c7 \
    29  	UMULH R2, R8, c5 \
    30  	ADCS ZR, c5 \
    31  	ADDS R1, c1 \
    32  	ADCS R26, c2 \
    33  	ADCS c6, c3 \
    34  	ADCS c7, c4 \
    35  	ADCS  ZR, c5 \
    36  	\ // R3 * (R5..R8) -> (R1,R26,R2,c7,c6), with c7 as scratch; then added into c2..c6
    37  	MUL R3, R5, R1 \
    38  	UMULH R3, R5, R26 \
    39  	MUL R3, R6, R0 \
    40  	ADDS R0, R26 \
    41  	UMULH R3, R6, R2 \
    42  	MUL R3, R7, R0 \
    43  	ADCS R0, R2 \
    44  	UMULH R3, R7, c7 \
    45  	MUL R3, R8, R0 \
    46  	ADCS R0, c7 \
    47  	UMULH R3, R8, c6 \
    48  	ADCS ZR, c6 \
    49  	ADDS R1, c2 \
    50  	ADCS R26, c3 \
    51  	ADCS R2, c4 \
    52  	ADCS c7, c5 \
    53  	ADCS  ZR, c6 \
    54  	\ // R4 * (R5..R8) -> (R1,R26,R2,R3,c7); then added into c3..c7
    55  	MUL R4, R5, R1 \
    56  	UMULH R4, R5, R26 \
    57  	MUL R4, R6, R0 \
    58  	ADDS R0, R26 \
    59  	UMULH R4, R6, R2 \
    60  	MUL R4, R7, R0 \
    61  	ADCS R0, R2 \
    62  	UMULH R4, R7, R3 \
    63  	MUL R4, R8, R0 \
    64  	ADCS R0, R3 \
    65  	UMULH R4, R8, c7 \
    66  	ADCS ZR, c7 \
    67  	ADDS R1, c3 \
    68  	ADCS R26, c4 \
    69  	ADCS R2, c5 \
    70  	ADCS R3, c6 \
    71  	ADCS  ZR, c7
    72  
    73  #define gfpReduce() \
    74  	\ // Reduces the 512-bit T in R9..R16 (R9 lowest) to four limbs in R1:R2:R3:R4. First step: m = (T * N') mod R, store m in R1:R2:R3:R4
    75  	MOVD ·np+0(SB), R17 \
    76  	MOVD ·np+8(SB), R25 \
    77  	MOVD ·np+16(SB), R19 \
    78  	MOVD ·np+24(SB), R20 \
    79  	\ // N' (from ·np) is now in R17,R25,R19,R20. R9 * N': only the low four limbs are kept -> R1..R4
    80  	MUL R9, R17, R1 \
    81  	UMULH R9, R17, R2 \
    82  	MUL R9, R25, R0 \
    83  	ADDS R0, R2 \
    84  	UMULH R9, R25, R3 \
    85  	MUL R9, R19, R0 \
    86  	ADCS R0, R3 \
    87  	UMULH R9, R19, R4 \
    88  	MUL R9, R20, R0 \
    89  	ADCS R0, R4 \
    90  	\ // R10 * N': low three limbs -> R21..R23, added into R2..R4
    91  	MUL R10, R17, R21 \
    92  	UMULH R10, R17, R22 \
    93  	MUL R10, R25, R0 \
    94  	ADDS R0, R22 \
    95  	UMULH R10, R25, R23 \
    96  	MUL R10, R19, R0 \
    97  	ADCS R0, R23 \
    98  	ADDS R21, R2 \
    99  	ADCS R22, R3 \
   100  	ADCS R23, R4 \
   101  	\ // R11 * N': low two limbs -> R21,R22, added into R3,R4
   102  	MUL R11, R17, R21 \
   103  	UMULH R11, R17, R22 \
   104  	MUL R11, R25, R0 \
   105  	ADDS R0, R22 \
   106  	ADDS R21, R3 \
   107  	ADCS R22, R4 \
   108  	\ // R12 * N': lowest limb -> R21, added into R4
   109  	MUL R12, R17, R21 \
   110  	ADDS R21, R4 \
   111  	\
   112  	\ // m * N: 512-bit product -> (R17,R25,R19,R20,R21,R22,R23,R24). NOTE(review): loadModulus is defined elsewhere; presumably it loads N into R5..R8 - confirm
   113  	loadModulus(R5,R6,R7,R8) \
   114  	mul(R17,R25,R19,R20,R21,R22,R23,R24) \
   115  	\
   116  	\ // Add the 512-bit intermediate (R9..R16) to m*N; R0 collects the carry out of the top limb
   117  	MOVD  ZR, R0 \
   118  	ADDS  R9, R17 \
   119  	ADCS R10, R25 \
   120  	ADCS R11, R19 \
   121  	ADCS R12, R20 \
   122  	ADCS R13, R21 \
   123  	ADCS R14, R22 \
   124  	ADCS R15, R23 \
   125  	ADCS R16, R24 \
   126  	ADCS  ZR, R0 \
   127  	\ // NOTE(review): R0 is not read again in this macro, so the selection below ignores that carry - confirm the sum always fits in 512 bits
   128  	\ // Our output is R21:R22:R23:R24. Reduce mod p if necessary: R10..R13 = output - (R5..R8).
   129  	SUBS R5, R21, R10 \
   130  	SBCS R6, R22, R11 \
   131  	SBCS R7, R23, R12 \
   132  	SBCS R8, R24, R13 \
   133  	\ // Carry set (no borrow) means output >= (R5..R8): take the difference, otherwise keep the output
   134  	CSEL CS, R10, R21, R1 \
   135  	CSEL CS, R11, R22, R2 \
   136  	CSEL CS, R12, R23, R3 \
   137  	CSEL CS, R13, R24, R4