// Source: github.com/dotlike13/wemix30_go@v1.8.23/crypto/bn256/cloudflare/mul_amd64.h

     1  #define mul(a0,a1,a2,a3, rb, stack) \
     2  	MOVQ a0, AX \
     3  	MULQ 0+rb \
     4  	MOVQ AX, R8 \
     5  	MOVQ DX, R9 \
     6  	MOVQ a0, AX \
     7  	MULQ 8+rb \
     8  	ADDQ AX, R9 \
     9  	ADCQ $0, DX \
    10  	MOVQ DX, R10 \
    11  	MOVQ a0, AX \
    12  	MULQ 16+rb \
    13  	ADDQ AX, R10 \
    14  	ADCQ $0, DX \
    15  	MOVQ DX, R11 \
    16  	MOVQ a0, AX \
    17  	MULQ 24+rb \
    18  	ADDQ AX, R11 \
    19  	ADCQ $0, DX \
    20  	MOVQ DX, R12 \
    21  	\
    22  	storeBlock(R8,R9,R10,R11, 0+stack) \
    23  	MOVQ R12, 32+stack \
    24  	\
    25  	MOVQ a1, AX \
    26  	MULQ 0+rb \
    27  	MOVQ AX, R8 \
    28  	MOVQ DX, R9 \
    29  	MOVQ a1, AX \
    30  	MULQ 8+rb \
    31  	ADDQ AX, R9 \
    32  	ADCQ $0, DX \
    33  	MOVQ DX, R10 \
    34  	MOVQ a1, AX \
    35  	MULQ 16+rb \
    36  	ADDQ AX, R10 \
    37  	ADCQ $0, DX \
    38  	MOVQ DX, R11 \
    39  	MOVQ a1, AX \
    40  	MULQ 24+rb \
    41  	ADDQ AX, R11 \
    42  	ADCQ $0, DX \
    43  	MOVQ DX, R12 \
    44  	\
    45  	ADDQ 8+stack, R8 \
    46  	ADCQ 16+stack, R9 \
    47  	ADCQ 24+stack, R10 \
    48  	ADCQ 32+stack, R11 \
    49  	ADCQ $0, R12 \
    50  	storeBlock(R8,R9,R10,R11, 8+stack) \
    51  	MOVQ R12, 40+stack \
    52  	\
    53  	MOVQ a2, AX \
    54  	MULQ 0+rb \
    55  	MOVQ AX, R8 \
    56  	MOVQ DX, R9 \
    57  	MOVQ a2, AX \
    58  	MULQ 8+rb \
    59  	ADDQ AX, R9 \
    60  	ADCQ $0, DX \
    61  	MOVQ DX, R10 \
    62  	MOVQ a2, AX \
    63  	MULQ 16+rb \
    64  	ADDQ AX, R10 \
    65  	ADCQ $0, DX \
    66  	MOVQ DX, R11 \
    67  	MOVQ a2, AX \
    68  	MULQ 24+rb \
    69  	ADDQ AX, R11 \
    70  	ADCQ $0, DX \
    71  	MOVQ DX, R12 \
    72  	\
    73  	ADDQ 16+stack, R8 \
    74  	ADCQ 24+stack, R9 \
    75  	ADCQ 32+stack, R10 \
    76  	ADCQ 40+stack, R11 \
    77  	ADCQ $0, R12 \
    78  	storeBlock(R8,R9,R10,R11, 16+stack) \
    79  	MOVQ R12, 48+stack \
    80  	\
    81  	MOVQ a3, AX \
    82  	MULQ 0+rb \
    83  	MOVQ AX, R8 \
    84  	MOVQ DX, R9 \
    85  	MOVQ a3, AX \
    86  	MULQ 8+rb \
    87  	ADDQ AX, R9 \
    88  	ADCQ $0, DX \
    89  	MOVQ DX, R10 \
    90  	MOVQ a3, AX \
    91  	MULQ 16+rb \
    92  	ADDQ AX, R10 \
    93  	ADCQ $0, DX \
    94  	MOVQ DX, R11 \
    95  	MOVQ a3, AX \
    96  	MULQ 24+rb \
    97  	ADDQ AX, R11 \
    98  	ADCQ $0, DX \
    99  	MOVQ DX, R12 \
   100  	\
   101  	ADDQ 24+stack, R8 \
   102  	ADCQ 32+stack, R9 \
   103  	ADCQ 40+stack, R10 \
   104  	ADCQ 48+stack, R11 \
   105  	ADCQ $0, R12 \
   106  	storeBlock(R8,R9,R10,R11, 24+stack) \
   107  	MOVQ R12, 56+stack
   108  
   109  #define gfpReduce(stack) \
   110  	\ // m = (T * N') mod R, store m in R8:R9:R10:R11
   111  	MOVQ ·np+0(SB), AX \
   112  	MULQ 0+stack \
   113  	MOVQ AX, R8 \
   114  	MOVQ DX, R9 \
   115  	MOVQ ·np+0(SB), AX \
   116  	MULQ 8+stack \
   117  	ADDQ AX, R9 \
   118  	ADCQ $0, DX \
   119  	MOVQ DX, R10 \
   120  	MOVQ ·np+0(SB), AX \
   121  	MULQ 16+stack \
   122  	ADDQ AX, R10 \
   123  	ADCQ $0, DX \
   124  	MOVQ DX, R11 \
   125  	MOVQ ·np+0(SB), AX \
   126  	MULQ 24+stack \
   127  	ADDQ AX, R11 \
   128  	\
   129  	MOVQ ·np+8(SB), AX \
   130  	MULQ 0+stack \
   131  	MOVQ AX, R12 \
   132  	MOVQ DX, R13 \
   133  	MOVQ ·np+8(SB), AX \
   134  	MULQ 8+stack \
   135  	ADDQ AX, R13 \
   136  	ADCQ $0, DX \
   137  	MOVQ DX, R14 \
   138  	MOVQ ·np+8(SB), AX \
   139  	MULQ 16+stack \
   140  	ADDQ AX, R14 \
   141  	\
   142  	ADDQ R12, R9 \
   143  	ADCQ R13, R10 \
   144  	ADCQ R14, R11 \
   145  	\
   146  	MOVQ ·np+16(SB), AX \
   147  	MULQ 0+stack \
   148  	MOVQ AX, R12 \
   149  	MOVQ DX, R13 \
   150  	MOVQ ·np+16(SB), AX \
   151  	MULQ 8+stack \
   152  	ADDQ AX, R13 \
   153  	\
   154  	ADDQ R12, R10 \
   155  	ADCQ R13, R11 \
   156  	\
   157  	MOVQ ·np+24(SB), AX \
   158  	MULQ 0+stack \
   159  	ADDQ AX, R11 \
   160  	\
   161  	storeBlock(R8,R9,R10,R11, 64+stack) \
   162  	\
   163  	\ // m * N
   164  	mul(·p2+0(SB),·p2+8(SB),·p2+16(SB),·p2+24(SB), 64+stack, 96+stack) \
   165  	\
   166  	\ // Add the 512-bit intermediate to m*N
   167  	loadBlock(96+stack, R8,R9,R10,R11) \
   168  	loadBlock(128+stack, R12,R13,R14,R15) \
   169  	\
   170  	MOVQ $0, AX \
   171  	ADDQ 0+stack, R8 \
   172  	ADCQ 8+stack, R9 \
   173  	ADCQ 16+stack, R10 \
   174  	ADCQ 24+stack, R11 \
   175  	ADCQ 32+stack, R12 \
   176  	ADCQ 40+stack, R13 \
   177  	ADCQ 48+stack, R14 \
   178  	ADCQ 56+stack, R15 \
   179  	ADCQ $0, AX \
   180  	\
   181  	gfpCarry(R12,R13,R14,R15,AX, R8,R9,R10,R11,BX)