// github.com/cloudflare/circl@v1.5.0/dh/csidh/fp511_amd64.s

//go:build amd64 && !purego
// +build amd64,!purego

#include "textflag.h"

// mul512Amd64 multiplies a 512-bit value (eight 64-bit limbs, least
// significant limb at offset 0) by a 64-bit value.
//
// Result: a = (b * c) mod 2^512
// The high half of the last limb product is discarded, which is what
// truncates the result to 512 bits.
//
// Two code paths are provided and selected at run time:
//   * ·hasBMI2 == 1 : MULX-based path (label mul512_mulx)
//   * otherwise     : MULQ-based path
//
// Registers used: AX, DX, SI, DI, R10, R11
//
// func mul512Amd64(a, b *Fp, c uint64)
TEXT ·mul512Amd64(SB), NOSPLIT, $0-24
    MOVQ a+0(FP), DI // result
    MOVQ b+8(FP), SI // multiplicand

    // Check whether to use optimized implementation
    CMPB ·hasBMI2(SB), $1
    JE mul512_mulx

    // Generic path. MULQ leaves the 128-bit product in DX:AX.
    // R10 keeps the multiplier, R11 carries the high half of the previous
    // limb product into the next limb. ADCQ $0, DX folds the carry of that
    // addition into the new high half.
    MOVQ c+16(FP), R10 // 64 bit multiplier, used by MULQ
    MOVQ R10, AX; MULQ 0(SI); MOVQ DX, R11; MOVQ AX, 0(DI) // a[0]
    MOVQ R10, AX; MULQ 8(SI); ADDQ R11, AX; ADCQ $0, DX; MOVQ DX, R11; MOVQ AX, 8(DI) // a[1]
    MOVQ R10, AX; MULQ 16(SI); ADDQ R11, AX; ADCQ $0, DX; MOVQ DX, R11; MOVQ AX, 16(DI) // a[2]
    MOVQ R10, AX; MULQ 24(SI); ADDQ R11, AX; ADCQ $0, DX; MOVQ DX, R11; MOVQ AX, 24(DI) // a[3]
    MOVQ R10, AX; MULQ 32(SI); ADDQ R11, AX; ADCQ $0, DX; MOVQ DX, R11; MOVQ AX, 32(DI) // a[4]
    MOVQ R10, AX; MULQ 40(SI); ADDQ R11, AX; ADCQ $0, DX; MOVQ DX, R11; MOVQ AX, 40(DI) // a[5]
    MOVQ R10, AX; MULQ 48(SI); ADDQ R11, AX; ADCQ $0, DX; MOVQ DX, R11; MOVQ AX, 48(DI) // a[6]
    MOVQ R10, AX; MULQ 56(SI); ADDQ R11, AX; MOVQ AX, 56(DI) // a[7], high half dropped
    RET

// Optimized for CPUs with BMI2
//
// MULXQ takes its second factor implicitly from DX and writes low/high
// halves to the two named registers. R10 and R11 alternate as holders of
// the previous high half. ADDQ starts the carry chain and each following
// ADCQ continues it; this relies on neither MULXQ nor MOVQ modifying the
// carry flag between them.
mul512_mulx:
    MOVQ c+16(FP), DX // 64 bit multiplier, used by MULX
    MULXQ 0(SI), AX, R10; MOVQ AX, 0(DI) // a[0]
    MULXQ 8(SI), AX, R11; ADDQ R10, AX; MOVQ AX, 8(DI) // a[1]
    MULXQ 16(SI), AX, R10; ADCQ R11, AX; MOVQ AX, 16(DI) // a[2]
    MULXQ 24(SI), AX, R11; ADCQ R10, AX; MOVQ AX, 24(DI) // a[3]
    MULXQ 32(SI), AX, R10; ADCQ R11, AX; MOVQ AX, 32(DI) // a[4]
    MULXQ 40(SI), AX, R11; ADCQ R10, AX; MOVQ AX, 40(DI) // a[5]
    MULXQ 48(SI), AX, R10; ADCQ R11, AX; MOVQ AX, 48(DI) // a[6]
    MULXQ 56(SI), AX, R11; ADCQ R10, AX; MOVQ AX, 56(DI) // a[7], high half dropped
    RET

// cswap512Amd64 swaps the 64 bytes at x with the 64 bytes at y when
// choice is 1 and leaves both unchanged when choice is 0. The selection
// is done with a bit mask (XOR/AND), there is no branch on choice.
//
// Frame layout: x+0(FP) and y+8(FP) are pointers, choice+16(FP) is one byte.
// NOTE(review): the Go prototype is declared outside this file; presumably
// func cswap512Amd64(x, y *Fp, choice uint8) - confirm against the stub.
//
// choice must be exactly 0 or 1: any other value does not negate to an
// all-ones mask and would mix the two operands.
//
// Registers used: AX, SI, DI, X0, X1, X2, X15
TEXT ·cswap512Amd64(SB),NOSPLIT,$0-17
    MOVQ x+0(FP), DI
    MOVQ y+8(FP), SI
    MOVBLZX choice+16(FP), AX // AL = 0 or 1, upper bits zeroed

    // Turn AX into a mask in which either all bits are set or none:
    // AX = 0 (choice == 0) or 0xFFFFFFFFFFFFFFFF (choice == 1)
    NEGQ AX

    // Fill X15. After this step the low 64 bits of X15 hold AX and
    // the high 64 bits are zero.
    MOVQ AX, X15

    // Copy the lowest double word into all four double words, so that
    // X15 = EAX|EAX|EAX|EAX. As AX has either all bits set or none,
    // X15 also ends up with either all bits set or none of them.
    PSHUFD $0, X15, X15

// CSWAP_BLOCK(idx) conditionally swaps the idx-th 16-byte block of x (DI)
// and y (SI) under the mask in X15. Unaligned loads/stores (MOVOU) are
// used, so no alignment is required of x and y. Clobbers X0, X1, X2.
#ifndef CSWAP_BLOCK
#define CSWAP_BLOCK(idx) \
    MOVOU (idx*16)(DI), X0 \
    MOVOU (idx*16)(SI), X1 \
    \ // X2 = mask & (X0 ^ X1)
    MOVO X1, X2 \
    PXOR X0, X2 \
    PAND X15, X2 \
    \
    PXOR X2, X0 \
    PXOR X2, X1 \
    \
    MOVOU X0, (idx*16)(DI) \
    MOVOU X1, (idx*16)(SI)
#endif

    // 4 blocks * 16 bytes = 64 bytes = 512 bits
    CSWAP_BLOCK(0)
    CSWAP_BLOCK(1)
    CSWAP_BLOCK(2)
    CSWAP_BLOCK(3)

    RET

// mulBmiAsm implements montgomery multiplication interleaved with
// montgomery reduction. It uses MULX and ADCX/ADOX instructions, and
// unlike mul512Amd64 it performs no run-time CPU feature check itself.
// Implementation specific to 511-bit prime 'p'
//
// The running sum is kept in nine 64-bit limbs held in
// R8, R9, R10, R11, R12, R13, R14, CX and BP. Each MULS_MULX_512
// invocation handles one limb of x and the limb registers are rotated by
// one position between invocations. After the eighth invocation the
// eight result limbs are in BP, R8, ..., R14 (least significant first).
//
// The value written to res is NOT conditionally reduced here; it may
// still need a final subtraction of p.
//
// Registers used: AX, BX, CX, DX, SI, DI, BP, R8-R14
// Stack: 8 bytes of local frame, used to preserve BP.
//
// func mulBmiAsm(res, x, y *fp)
TEXT ·mulBmiAsm(SB),NOSPLIT,$8-24

    MOVQ x+8(FP), DI // multiplicand
    MOVQ y+16(FP), SI // multiplier

    // Clear all accumulator limbs.
    XORQ R8, R8
    XORQ R9, R9
    XORQ R10, R10
    XORQ R11, R11
    XORQ R12, R12
    XORQ R13, R13
    XORQ R14, R14
    XORQ CX, CX

    // BP serves as the ninth accumulator limb, so its incoming value is
    // parked in the local frame and restored before RET.
    MOVQ BP, 0(SP) // push: BP is Callee-save.
    XORQ BP, BP

// MULS_MULX_512(idx, r0..r8) processes limb idx of x (DI) against all of
// y (SI), accumulating into limbs r0 (lowest) .. r8 (highest).
//
// Reduction step:
//   DX = (r0 + low64(x[idx] * y[0])) * pNegInv  mod 2^64
//   r0..r8 += DX * p
//   NOTE(review): pNegInv is defined outside this file; presumably it is
//   -p^(-1) mod 2^64, which would make r0 come out as zero after both
//   steps so it can be reused as the top limb of the next invocation -
//   confirm against its definition.
// Multiplication step:
//   r0..r8 += x[idx] * y
//
// Both steps run two independent carry chains: ADOXQ (overflow flag) adds
// the low halves of the partial products and ADCXQ (carry flag) adds the
// high halves one limb up. XORQ AX, AX clears both flags before each
// chain. The chain is closed with MOVQ $0, AX rather than XORQ, because
// the pending OF carry must survive until the final ADOXQ into r8.
// Clobbers AX, BX, DX and flags.
//
// Uses BMI2 (MULX)
#ifdef MULS_MULX_512
#undef MULS_MULX_512
#endif
#define MULS_MULX_512(idx, r0, r1, r2, r3, r4, r5, r6, r7, r8) \
    \ // Reduction step
    MOVQ ( 0)(SI), DX \
    MULXQ ( 8*idx)(DI), DX, AX \
    ADDQ r0, DX \
    MOVQ ·pNegInv(SB), AX \
    MULXQ AX, DX, AX \
    \
    XORQ AX, AX; \
    MOVQ ·p+ 0(SB), AX; MULXQ AX, AX, BX; ADOXQ AX, r0; ADCXQ BX, r1 \
    MOVQ ·p+ 8(SB), AX; MULXQ AX, AX, BX; ADOXQ AX, r1; ADCXQ BX, r2 \
    MOVQ ·p+16(SB), AX; MULXQ AX, AX, BX; ADOXQ AX, r2; ADCXQ BX, r3 \
    MOVQ ·p+24(SB), AX; MULXQ AX, AX, BX; ADOXQ AX, r3; ADCXQ BX, r4 \
    MOVQ ·p+32(SB), AX; MULXQ AX, AX, BX; ADOXQ AX, r4; ADCXQ BX, r5 \
    MOVQ ·p+40(SB), AX; MULXQ AX, AX, BX; ADOXQ AX, r5; ADCXQ BX, r6 \
    MOVQ ·p+48(SB), AX; MULXQ AX, AX, BX; ADOXQ AX, r6; ADCXQ BX, r7 \
    MOVQ ·p+56(SB), AX; MULXQ AX, AX, BX; ADOXQ AX, r7; ADCXQ BX, r8 \
    MOVQ $0, AX; ;;;;;;;;;;;;;;;;;;;;;;; ADOXQ AX, r8; \
    \ // Multiplication step
    MOVQ (8*idx)(DI), DX \
    \
    XORQ AX, AX \
    MULXQ ( 0)(SI), AX, BX; ADOXQ AX, r0; ADCXQ BX, r1 \
    MULXQ ( 8)(SI), AX, BX; ADOXQ AX, r1; ADCXQ BX, r2 \
    MULXQ (16)(SI), AX, BX; ADOXQ AX, r2; ADCXQ BX, r3 \
    MULXQ (24)(SI), AX, BX; ADOXQ AX, r3; ADCXQ BX, r4 \
    MULXQ (32)(SI), AX, BX; ADOXQ AX, r4; ADCXQ BX, r5 \
    MULXQ (40)(SI), AX, BX; ADOXQ AX, r5; ADCXQ BX, r6 \
    MULXQ (48)(SI), AX, BX; ADOXQ AX, r6; ADCXQ BX, r7 \
    MULXQ (56)(SI), AX, BX; ADOXQ AX, r7; ADCXQ BX, r8 \
    MOVQ $0, AX ; ADOXQ AX, r8;

    // One invocation per limb of x; the register list is rotated left by
    // one each time so the previous r1 becomes the new r0.
    MULS_MULX_512(0, R8, R9, R10, R11, R12, R13, R14, CX, BP)
    MULS_MULX_512(1, R9, R10, R11, R12, R13, R14, CX, BP, R8)
    MULS_MULX_512(2, R10, R11, R12, R13, R14, CX, BP, R8, R9)
    MULS_MULX_512(3, R11, R12, R13, R14, CX, BP, R8, R9, R10)
    MULS_MULX_512(4, R12, R13, R14, CX, BP, R8, R9, R10, R11)
    MULS_MULX_512(5, R13, R14, CX, BP, R8, R9, R10, R11, R12)
    MULS_MULX_512(6, R14, CX, BP, R8, R9, R10, R11, R12, R13)
    MULS_MULX_512(7, CX, BP, R8, R9, R10, R11, R12, R13, R14)
#undef MULS_MULX_512

    // Write out the eight result limbs (r1..r8 of the last invocation).
    // DI is reloaded here: it held the x pointer up to this point.
    MOVQ res+0(FP), DI
    MOVQ BP, ( 0)(DI)
    MOVQ R8, ( 8)(DI)
    MOVQ R9, (16)(DI)
    MOVQ R10, (24)(DI)
    MOVQ R11, (32)(DI)
    MOVQ R12, (40)(DI)
    MOVQ R13, (48)(DI)
    MOVQ R14, (56)(DI)
    MOVQ 0(SP), BP // pop: BP is Callee-save.

    // The value now stored at res (pointed to by DI) still needs to be
    // reduced if it is > p; that final subtraction is not done here.
    RET