github.com/theQRL/go-zond@v0.1.1/crypto/bn256/cloudflare/gfp_amd64.s (about)

     1  // +build amd64,!generic
     2  
     3  #define storeBlock(a0,a1,a2,a3, r) \ // storeBlock: write a0..a3 to the four consecutive 8-byte words at memory operand r (e.g. 0(DI), 32(SP))
     4  	MOVQ a0,  0+r \
     5  	MOVQ a1,  8+r \
     6  	MOVQ a2, 16+r \
     7  	MOVQ a3, 24+r // a3 lands at the highest offset (r+24)
     8  
     9  #define loadBlock(r, a0,a1,a2,a3) \ // loadBlock: read the four consecutive 8-byte words at memory operand r into a0..a3 (inverse of storeBlock)
    10  	MOVQ  0+r, a0 \
    11  	MOVQ  8+r, a1 \
    12  	MOVQ 16+r, a2 \
    13  	MOVQ 24+r, a3 // a3 receives the word at the highest offset (r+24)
    14  
    15  #define gfpCarry(a0,a1,a2,a3,a4, b0,b1,b2,b3,b4) \ // gfpCarry: a0..a3 = a-p if the five-word value a4:a3:a2:a1:a0 is >= p, else unchanged; b0..b4 are scratch, a4 is never written, flags are clobbered
    16  	\ // b = a-p, where p is the four-word constant ·p2 and a4/b4 is the fifth (most significant) word
    17  	MOVQ a0, b0 \
    18  	MOVQ a1, b1 \
    19  	MOVQ a2, b2 \
    20  	MOVQ a3, b3 \
    21  	MOVQ a4, b4 \
    22  	\ // SUBQ/SBBQ borrow chain from the lowest word up; the final borrow out of b4 stays in the carry flag
    23  	SUBQ ·p2+0(SB), b0 \
    24  	SBBQ ·p2+8(SB), b1 \
    25  	SBBQ ·p2+16(SB), b2 \
    26  	SBBQ ·p2+24(SB), b3 \
    27  	SBBQ $0, b4 \
    28  	\ // CMOVQCC copies only when the carry flag is clear, i.e. when a-p did not borrow
    29  	\ // if b is negative then return a
    30  	\ // else return b
    31  	CMOVQCC b0, a0 \
    32  	CMOVQCC b1, a1 \
    33  	CMOVQCC b2, a2 \
    34  	CMOVQCC b3, a3
    35  
    36  #include "mul_amd64.h"
    37  #include "mul_bmi2_amd64.h"
    38  
    39  TEXT ·gfpNeg(SB),0,$0-16 // gfpNeg: *c = ·p2 - *a, with one conditional subtraction of ·p2; c+0(FP) and a+8(FP) point at four 64-bit words, least significant first
    40  	MOVQ ·p2+0(SB), R8 // R11:R10:R9:R8 = ·p2 (symbol defined outside this file; the gfpCarry comment calls it p)
    41  	MOVQ ·p2+8(SB), R9
    42  	MOVQ ·p2+16(SB), R10
    43  	MOVQ ·p2+24(SB), R11
    44  	// R11..R8 -= *a; the borrow out of the top word is not recorded (assumes *a <= ·p2 — TODO confirm callers pass reduced values)
    45  	MOVQ a+8(FP), DI
    46  	SUBQ 0(DI), R8
    47  	SBBQ 8(DI), R9
    48  	SBBQ 16(DI), R10
    49  	SBBQ 24(DI), R11
    50  	// ·p2 - 0 is ·p2 itself, so subtract ·p2 once more when the result is >= ·p2; AX supplies a zero fifth word, R12,R13,R14,CX,BX are scratch
    51  	MOVQ $0, AX
    52  	gfpCarry(R8,R9,R10,R11,AX, R12,R13,R14,CX,BX)
    53  	// store the four result words through c
    54  	MOVQ c+0(FP), DI
    55  	storeBlock(R8,R9,R10,R11, 0(DI))
    56  	RET
    57  
    58  TEXT ·gfpAdd(SB),0,$0-24 // gfpAdd: *c = *a + *b, with one conditional subtraction of ·p2; c+0(FP), a+8(FP), b+16(FP) point at four 64-bit words, least significant first
    59  	MOVQ a+8(FP), DI
    60  	MOVQ b+16(FP), SI
    61  	// R11..R8 = *a; R12 starts at 0 and will receive the carry out of the four-word add
    62  	loadBlock(0(DI), R8,R9,R10,R11)
    63  	MOVQ $0, R12
    64  	// R12:R11:R10:R9:R8 = *a + *b as a five-word sum
    65  	ADDQ  0(SI), R8
    66  	ADCQ  8(SI), R9
    67  	ADCQ 16(SI), R10
    68  	ADCQ 24(SI), R11
    69  	ADCQ $0, R12
    70  	// replace the sum by sum - ·p2 unless that subtraction borrows (R13,R14,CX,AX,BX are scratch); fully reduced only if the sum is below 2*·p2 — presumably inputs are already below ·p2, confirm with callers
    71  	gfpCarry(R8,R9,R10,R11,R12, R13,R14,CX,AX,BX)
    72  	// store the low four words through c
    73  	MOVQ c+0(FP), DI
    74  	storeBlock(R8,R9,R10,R11, 0(DI))
    75  	RET
    76  
    77  TEXT ·gfpSub(SB),0,$0-24 // gfpSub: *c = *a - *b, adding ·p2 back when the subtraction borrows; c+0(FP), a+8(FP), b+16(FP) point at four 64-bit words, least significant first
    78  	MOVQ a+8(FP), DI
    79  	MOVQ b+16(FP), SI
    80  	// R11..R8 = *a
    81  	loadBlock(0(DI), R8,R9,R10,R11)
    82  	// CX:R14:R13:R12 = ·p2 and AX = 0: the two candidate values for the correction term
    83  	MOVQ ·p2+0(SB), R12
    84  	MOVQ ·p2+8(SB), R13
    85  	MOVQ ·p2+16(SB), R14
    86  	MOVQ ·p2+24(SB), CX
    87  	MOVQ $0, AX
    88  	// R11..R8 = *a - *b; the carry flag now holds the final borrow
    89  	SUBQ  0(SI), R8
    90  	SBBQ  8(SI), R9
    91  	SBBQ 16(SI), R10
    92  	SBBQ 24(SI), R11
    93  	// no borrow (carry clear): zero the correction so the add below changes nothing; CMOVQ does not modify flags, so all four test the same borrow
    94  	CMOVQCC AX, R12
    95  	CMOVQCC AX, R13
    96  	CMOVQCC AX, R14
    97  	CMOVQCC AX, CX
    98  	// add the correction (·p2 after a borrow, 0 otherwise); the carry out of this add is discarded
    99  	ADDQ R12, R8
   100  	ADCQ R13, R9
   101  	ADCQ R14, R10
   102  	ADCQ CX, R11
   103  	// store the four result words through c
   104  	MOVQ c+0(FP), DI
   105  	storeBlock(R8,R9,R10,R11, 0(DI))
   106  	RET
   107  
   108  TEXT ·gfpMul(SB),0,$160-24 // gfpMul: multiply *a by *b, reduce, and store four 64-bit words through c; c+0(FP), a+8(FP), b+16(FP) are pointers; the 160-byte frame is scratch for the mul/reduce macros
   109  	MOVQ a+8(FP), DI
   110  	MOVQ b+16(FP), SI
   111  
   112  	// Jump to a slightly different implementation if MULX isn't supported.
   113  	CMPB ·hasBMI2(SB), $0
   114  	JE   nobmi2Mul
   115  	// BMI2 path: the eight registers R8..R11 and R12,R13,R14,CX are spilled to 0(SP)..63(SP) after mulBMI2 (presumably the full product — confirm in mul_bmi2_amd64.h)
   116  	mulBMI2(0(DI),8(DI),16(DI),24(DI), 0(SI))
   117  	storeBlock( R8, R9,R10,R11,  0(SP))
   118  	storeBlock(R12,R13,R14,CX, 32(SP))
   119  	gfpReduceBMI2() // takes no operands; NOTE(review): presumably reads the product spilled at 0(SP) — confirm in the macro definition
   120  	JMP end
   121  
   122  nobmi2Mul: // generic path: mul receives 0(SP) as its last operand and gfpReduce is pointed at the same location
   123  	mul(0(DI),8(DI),16(DI),24(DI), 0(SI), 0(SP))
   124  	gfpReduce(0(SP))
   125  
   126  end: // both paths are expected to leave the four result words in R12,R13,R14,CX, which are stored below
   127  	MOVQ c+0(FP), DI
   128  	storeBlock(R12,R13,R14,CX, 0(DI))
   129  	RET