github.com/ebceco/ebc@v1.8.19-0.20190309150932-8cb0b9e06484/crypto/bn256/cloudflare/mul_bmi2_amd64.h (about)

     1  #define mulBMI2(a0,a1,a2,a3, rb) \
        	\ // mulBMI2 multiplies the four 64-bit words a0,a1,a2,a3 (a0 lowest) by the
        	\ // four 64-bit words at 0+rb, 8+rb, 16+rb and 24+rb (0+rb lowest) and leaves
        	\ // the eight-word product in R8 (lowest word) through R15 (highest word).
        	\ //
        	\ // rb is pasted after a numeric offset ("0+rb"), so it has to be a memory
        	\ // operand such as 64(SP). a0..a3 are only used as the source of a MOVQ.
        	\ // Overwrites DX, AX, BX, R8-R15 and the flags. a1..a3 and rb are still
        	\ // read after those registers have been written, so they should not refer
        	\ // to any of them.
        	\ //
        	\ // Each row loads one a-word into DX, the implicit multiplicand of MULXQ.
        	\ // MULXQ puts the low half of the product in its first destination and the
        	\ // high half in its second, and does not touch the flags, which is what
        	\ // lets the ADDQ/ADCQ carry chains below run across the multiplies.
        	\ //
        	\ // Row a0: R8..R12 are written directly by the multiplies and the carry
        	\ // chain; R13 is cleared first and receives the last carry.
     2  	MOVQ a0, DX \
     3  	MOVQ $0, R13 \
     4  	MULXQ 0+rb, R8, R9 \
     5  	MULXQ 8+rb, AX, R10 \
     6  	ADDQ AX, R9 \
     7  	MULXQ 16+rb, AX, R11 \
     8  	ADCQ AX, R10 \
     9  	MULXQ 24+rb, AX, R12 \
    10  	ADCQ AX, R11 \
    11  	ADCQ $0, R12 \
    12  	ADCQ $0, R13 \
    13  	\ // Row a1: added into R9..R13. Words 0 and 16 of rb go on one carry chain, words 8 and 24 on a second; R14 is cleared and receives the last carry.
    14  	MOVQ a1, DX \
    15  	MOVQ $0, R14 \
    16  	MULXQ 0+rb, AX, BX \
    17  	ADDQ AX, R9 \
    18  	ADCQ BX, R10 \
    19  	MULXQ 16+rb, AX, BX \
    20  	ADCQ AX, R11 \
    21  	ADCQ BX, R12 \
    22  	ADCQ $0, R13 \
    23  	MULXQ 8+rb, AX, BX \
    24  	ADDQ AX, R10 \
    25  	ADCQ BX, R11 \
    26  	MULXQ 24+rb, AX, BX \
    27  	ADCQ AX, R12 \
    28  	ADCQ BX, R13 \
    29  	ADCQ $0, R14 \
    30  	\ // Row a2: same two-chain pattern, added into R10..R14; R15 is cleared and receives the last carry.
    31  	MOVQ a2, DX \
    32  	MOVQ $0, R15 \
    33  	MULXQ 0+rb, AX, BX \
    34  	ADDQ AX, R10 \
    35  	ADCQ BX, R11 \
    36  	MULXQ 16+rb, AX, BX \
    37  	ADCQ AX, R12 \
    38  	ADCQ BX, R13 \
    39  	ADCQ $0, R14 \
    40  	MULXQ 8+rb, AX, BX \
    41  	ADDQ AX, R11 \
    42  	ADCQ BX, R12 \
    43  	MULXQ 24+rb, AX, BX \
    44  	ADCQ AX, R13 \
    45  	ADCQ BX, R14 \
    46  	ADCQ $0, R15 \
    47  	\ // Row a3: same pattern, added into R11..R15. No further word is cleared; the carry out of the final ADCQ is not captured.
    48  	MOVQ a3, DX \
    49  	MULXQ 0+rb, AX, BX \
    50  	ADDQ AX, R11 \
    51  	ADCQ BX, R12 \
    52  	MULXQ 16+rb, AX, BX \
    53  	ADCQ AX, R13 \
    54  	ADCQ BX, R14 \
    55  	ADCQ $0, R15 \
    56  	MULXQ 8+rb, AX, BX \
    57  	ADDQ AX, R12 \
    58  	ADCQ BX, R13 \
    59  	MULXQ 24+rb, AX, BX \
    60  	ADCQ AX, R14 \
    61  	ADCQ BX, R15
    62  
    63  #define gfpReduceBMI2() \
        	\ // gfpReduceBMI2 reduces the eight 64-bit words T stored at 0(SP)..56(SP)
        	\ // (0(SP) lowest). It reads the four-word symbols ·np and ·p2 (N' and N in
        	\ // the step comments below) and uses 64(SP)..88(SP) as scratch for m, so
        	\ // the frame must make at least 96 bytes addressable from SP.
        	\ // Overwrites AX, BX, DX, R8-R15 and the flags, plus whatever storeBlock
        	\ // and gfpCarry (both defined outside this file) touch.
        	\ // NOTE(review): the three steps have the shape of Montgomery reduction;
        	\ // confirm against the definitions of ·np, ·p2 and gfpCarry.
        	\ //
    64  	\ // m = (T * N') mod R, store m in R8:R9:R10:R11
        	\ // Only the low four words of T (0(SP)..24(SP)) are used and only the low
        	\ // four words of the product are kept: high halves that would land above
        	\ // R11 are left unused in BX, and carries out of R11 are ignored.
    65  	MOVQ ·np+0(SB), DX \
    66  	MULXQ 0(SP), R8, R9 \
    67  	MULXQ 8(SP), AX, R10 \
    68  	ADDQ AX, R9 \
    69  	MULXQ 16(SP), AX, R11 \
    70  	ADCQ AX, R10 \
    71  	MULXQ 24(SP), AX, BX \
    72  	ADCQ AX, R11 \
    73  	\ // second word of ·np: contributes to R9..R11
    74  	MOVQ ·np+8(SB), DX \
    75  	MULXQ 0(SP), AX, BX \
    76  	ADDQ AX, R9 \
    77  	ADCQ BX, R10 \
    78  	MULXQ 16(SP), AX, BX \
    79  	ADCQ AX, R11 \
    80  	MULXQ 8(SP), AX, BX \
    81  	ADDQ AX, R10 \
    82  	ADCQ BX, R11 \
    83  	\ // third word of ·np: contributes to R10..R11
    84  	MOVQ ·np+16(SB), DX \
    85  	MULXQ 0(SP), AX, BX \
    86  	ADDQ AX, R10 \
    87  	ADCQ BX, R11 \
    88  	MULXQ 8(SP), AX, BX \
    89  	ADDQ AX, R11 \
    90  	\ // fourth word of ·np: contributes to R11 only
    91  	MOVQ ·np+24(SB), DX \
    92  	MULXQ 0(SP), AX, BX \
    93  	ADDQ AX, R11 \
    94  	\ // mulBMI2 takes its second operand from memory, so m goes to 64(SP). storeBlock is defined elsewhere; presumably it writes R8..R11 to 64(SP)..88(SP) - confirm.
    95  	storeBlock(R8,R9,R10,R11, 64(SP)) \
    96  	\
    97  	\ // m * N (mulBMI2 leaves the eight-word product in R8..R15)
    98  	mulBMI2(·p2+0(SB),·p2+8(SB),·p2+16(SB),·p2+24(SB), 64(SP)) \
    99  	\
   100  	\ // Add the 512-bit intermediate to m*N
        	\ // The sum stays in R8..R15; AX is cleared first and collects the carry
        	\ // out of the top word.
   101  	MOVQ $0, AX \
   102  	ADDQ 0(SP), R8 \
   103  	ADCQ 8(SP), R9 \
   104  	ADCQ 16(SP), R10 \
   105  	ADCQ 24(SP), R11 \
   106  	ADCQ 32(SP), R12 \
   107  	ADCQ 40(SP), R13 \
   108  	ADCQ 48(SP), R14 \
   109  	ADCQ 56(SP), R15 \
   110  	ADCQ $0, AX \
   111  	\ // gfpCarry is defined elsewhere. NOTE(review): presumably it treats R12..R15 with carry AX as the value to bring below the modulus, using R8..R11 and BX as scratch - confirm against its definition.
   112  	gfpCarry(R12,R13,R14,R15,AX, R8,R9,R10,R11,BX)