// mul_bmi2_amd64.h
// Origin: github.com/ebceco/ebc@v1.8.19-0.20190309150932-8cb0b9e06484/crypto/bn256/cloudflare
//
// Go assembler macros that use the BMI2 MULXQ instruction for 256-bit
// multiplication and reduction.
//
// NOTE: inside a macro body a comment must follow the line-continuation
// backslash ("\ // text"). A comment placed before the backslash would
// swallow it and end the macro early.

// mulBMI2 computes the full 512-bit product of two 256-bit integers.
//
//   a0..a3  the four 64-bit words of the first factor, least significant
//           first. Each is used as the source operand of a MOVQ into DX.
//   rb      memory operand for the second factor; its words are read at
//           0+rb, 8+rb, 16+rb and 24+rb.
//
// Result:   R8 (least significant word) through R15 (most significant).
// Clobbers: AX, BX, DX, R8-R15 and the flags.
//
// a1..a3 and rb are evaluated after result registers have already been
// written, so they must not refer to AX, BX, DX or R8-R15.
//
// MULXQ multiplies its source by DX, writes the low word to its first
// destination and the high word to its second, and leaves the flags alone.
// That is what allows it to be interleaved with an ADDQ/ADCQ carry chain.
// For a1..a3 each row is accumulated as two chains: first the products with
// words 0 and 2 of rb, then the products with words 1 and 3. Within one chain
// the low/high pairs land in adjacent, non-overlapping result words.
#define mulBMI2(a0,a1,a2,a3, rb) \
	\ // Row 0: a0 * b initialises R8..R12; R13 starts at zero.
	MOVQ a0, DX \
	MOVQ $0, R13 \
	MULXQ 0+rb, R8, R9 \
	MULXQ 8+rb, AX, R10 \
	ADDQ AX, R9 \
	MULXQ 16+rb, AX, R11 \
	ADCQ AX, R10 \
	MULXQ 24+rb, AX, R12 \
	ADCQ AX, R11 \
	ADCQ $0, R12 \
	ADCQ $0, R13 \
	\
	\ // Row 1: add a1 * b into R9..R14.
	MOVQ a1, DX \
	MOVQ $0, R14 \
	MULXQ 0+rb, AX, BX \
	ADDQ AX, R9 \
	ADCQ BX, R10 \
	MULXQ 16+rb, AX, BX \
	ADCQ AX, R11 \
	ADCQ BX, R12 \
	ADCQ $0, R13 \
	MULXQ 8+rb, AX, BX \
	ADDQ AX, R10 \
	ADCQ BX, R11 \
	MULXQ 24+rb, AX, BX \
	ADCQ AX, R12 \
	ADCQ BX, R13 \
	ADCQ $0, R14 \
	\
	\ // Row 2: add a2 * b into R10..R15.
	MOVQ a2, DX \
	MOVQ $0, R15 \
	MULXQ 0+rb, AX, BX \
	ADDQ AX, R10 \
	ADCQ BX, R11 \
	MULXQ 16+rb, AX, BX \
	ADCQ AX, R12 \
	ADCQ BX, R13 \
	ADCQ $0, R14 \
	MULXQ 8+rb, AX, BX \
	ADDQ AX, R11 \
	ADCQ BX, R12 \
	MULXQ 24+rb, AX, BX \
	ADCQ AX, R13 \
	ADCQ BX, R14 \
	ADCQ $0, R15 \
	\
	\ // Row 3: add a3 * b into R11..R15; R15 is the top word, no carry-out is kept.
	MOVQ a3, DX \
	MULXQ 0+rb, AX, BX \
	ADDQ AX, R11 \
	ADCQ BX, R12 \
	MULXQ 16+rb, AX, BX \
	ADCQ AX, R13 \
	ADCQ BX, R14 \
	ADCQ $0, R15 \
	MULXQ 8+rb, AX, BX \
	ADDQ AX, R12 \
	ADCQ BX, R13 \
	MULXQ 24+rb, AX, BX \
	ADCQ AX, R14 \
	ADCQ BX, R15

// gfpReduceBMI2 reduces the 512-bit value T held at 0(SP)..56(SP)
// (eight 64-bit words, least significant first).
//
// Steps, as written below:
//   1. m = low 256 bits of (low 256 bits of T) * ·np, built in R8..R11.
//      High product words that would fall beyond word 3 are computed into BX
//      and never used.
//   2. m is passed to storeBlock with destination 64(SP); mulBMI2 then reads
//      it back from 64(SP)..88(SP) and multiplies it by the four words of
//      ·p2, leaving the 512-bit product in R8..R15.
//   3. T is added to that product word by word; the carry out of the top
//      word is collected in AX.
//   4. R12..R15 and AX, together with R8..R11 and BX, are handed to gfpCarry.
//
// ·np, ·p2, storeBlock and gfpCarry are defined elsewhere. Going by the
// inline comments, ·np holds N' and ·p2 holds N of a Montgomery reduction,
// and gfpCarry presumably performs the final conditional subtraction, with
// R8..R11 and BX used as its scratch registers - TODO confirm against their
// definitions.
//
// Stack use: reads 0(SP)..63(SP); 64(SP) onwards is scratch for m.
// Clobbers:  AX, BX, DX, R8-R15 and the flags, plus whatever storeBlock and
//            gfpCarry touch.
#define gfpReduceBMI2() \
	\ // m = (T * N') mod R, store m in R8:R9:R10:R11
	MOVQ ·np+0(SB), DX \
	MULXQ 0(SP), R8, R9 \
	MULXQ 8(SP), AX, R10 \
	ADDQ AX, R9 \
	MULXQ 16(SP), AX, R11 \
	ADCQ AX, R10 \
	MULXQ 24(SP), AX, BX \
	ADCQ AX, R11 \
	\
	MOVQ ·np+8(SB), DX \
	MULXQ 0(SP), AX, BX \
	ADDQ AX, R9 \
	ADCQ BX, R10 \
	MULXQ 16(SP), AX, BX \
	ADCQ AX, R11 \
	MULXQ 8(SP), AX, BX \
	ADDQ AX, R10 \
	ADCQ BX, R11 \
	\
	MOVQ ·np+16(SB), DX \
	MULXQ 0(SP), AX, BX \
	ADDQ AX, R10 \
	ADCQ BX, R11 \
	MULXQ 8(SP), AX, BX \
	ADDQ AX, R11 \
	\
	MOVQ ·np+24(SB), DX \
	MULXQ 0(SP), AX, BX \
	ADDQ AX, R11 \
	\
	storeBlock(R8,R9,R10,R11, 64(SP)) \
	\
	\ // m * N
	mulBMI2(·p2+0(SB),·p2+8(SB),·p2+16(SB),·p2+24(SB), 64(SP)) \
	\
	\ // Add the 512-bit intermediate to m*N
	MOVQ $0, AX \
	ADDQ 0(SP), R8 \
	ADCQ 8(SP), R9 \
	ADCQ 16(SP), R10 \
	ADCQ 24(SP), R11 \
	ADCQ 32(SP), R12 \
	ADCQ 40(SP), R13 \
	ADCQ 48(SP), R14 \
	ADCQ 56(SP), R15 \
	ADCQ $0, AX \
	\
	gfpCarry(R12,R13,R14,R15,AX, R8,R9,R10,R11,BX)