// Origin: github.com/cloudflare/circl@v1.5.0/ecc/fourq/fq_amd64.h
//
// Go assembler (amd64) macros for arithmetic on 32-byte Fq elements.
//
// Layout used by every macro below: an element x is addressed as two
// 16-byte halves, x0 at offsets 0 and 8 and x1 at offsets 16 and 24, each
// half being two 64-bit limbs with the low limb at the lower offset.
// The c0/c1 formulas in the comments treat x as x0 + x1*i with i^2 = -1.
//
// Reductions fold everything at bit 127 and above back onto the low
// 127 bits, i.e. they work modulo 2^127-1.
//
// Comment convention inside a macro body: every line of the body ends the
// previous line with a backslash, so comments are written as "\ // text".

#include "fp_amd64.h"

// _fqAdd: c = a + b, one _fpAdd per 16-byte half.
// _fpAdd is not defined here (expected to come from fp_amd64.h).
#define _fqAdd(c,a,b) \
    _fpAdd( 0+c, 0+a, 0+b) \
    _fpAdd(16+c,16+a,16+b)

// _fqSub: c = a - b, one _fpSub per 16-byte half.
// _fpSub is not defined here (expected to come from fp_amd64.h).
#define _fqSub(c,a,b) \
    _fpSub( 0+c, 0+a, 0+b) \
    _fpSub(16+c,16+a,16+b)

// _fqMulBmi2: c = a * b using MULXQ (requires BMI2).
//   c0 = a0*b0 - a1*b1
//   c1 = a0*b1 + a1*b0
// c, a, b are memory operands to which the byte offsets 0/8/16/24 are
// prepended. Overwrites AX, DX, R8-R15 and the flags.
// Multiplications of the next product are interleaved with the carry
// chain of the previous one, so the instruction order is significant.
#define _fqMulBmi2(c, a, b) \
    \ // NOTE(review): this store looks redundant - R15 is next written by
    \ // a MULXQ before any read in this macro. Confirm before removing.
    MOVL $0, R15 \
    \ // T0 = a0 * b0, R11:R10:R9:R8 <- (8+a:0+a) * (8+b:0+b)
    MOVQ 0+b, DX \
    MULXQ 0+a, R8, R9 \
    MULXQ 8+a, R10, AX \
    ADDQ R10, R9 \
    MOVQ 8+b, DX \
    MULXQ 8+a, R10, R11 \
    ADCQ AX, R10 \
    MULXQ 0+a, DX, AX \
    ADCQ $0, R11 \
    ADDQ DX, R9 \
    \
    \ // T1 = a1 * b1, R15:R14:R13:R12 <- (24+a:16+a) * (24+b:16+b)
    \ // The two ADCQ on R10/R11 below finish the carry chain of T0.
    MOVQ 16+b, DX \
    MULXQ 16+a, R12, R13 \
    ADCQ AX, R10 \
    MULXQ 24+a, R14, AX \
    ADCQ $0, R11 \
    MOVQ 24+b, DX \
    ADDQ R14, R13 \
    MULXQ 24+a, R14, R15 \
    ADCQ AX, R14 \
    ADCQ $0, R15 \
    MULXQ 16+a, DX, AX \
    ADDQ DX, R13 \
    ADCQ AX, R14 \
    ADCQ $0, R15 \
    \
    \ // c0 = T0 - T1 = a0*b0 - a1*b1 (256-bit, may wrap negative)
    SUBQ R12, R8 \
    SBBQ R13, R9 \
    SBBQ R14, R10 \
    SBBQ R15, R11 \
    \
    \ // Split at bit 127: R11:R10 <- bits 127 and up, R9:R8 keeps bits 0..126
    SHLQ $1, R10, R11 \
    SHLQ $1, R9, R10 \
    MOVQ 16+b, DX \
    BTRQ $63, R9 \
    \
    \ // T0 = a0 * b1, R15:R14:R13:R12 <- (8+a:0+a) * (24+b:16+b)
    \ // BTRQ $63, R11 moves the top bit of the high part into CF and the
    \ // two SBBQ subtract it; MULXQ in between leaves the flags untouched.
    MULXQ 0+a, R12, R13 \
    BTRQ $63, R11 \
    SBBQ $0, R10 \
    SBBQ $0, R11 \
    MULXQ 8+a, R14, AX \
    ADDQ R14, R13 \
    MOVQ 24+b, DX \
    MULXQ 8+a, R14, R15 \
    ADCQ AX, R14 \
    ADCQ $0, R15 \
    MULXQ 0+a, DX, AX \
    ADDQ DX, R13 \
    ADCQ AX, R14 \
    ADCQ $0, R15 \
    \
    \ // Reducing c0: add low and high parts, fold bit 127 once more
    ADDQ R8, R10 \
    ADCQ R9, R11 \
    BTRQ $63, R11 \
    ADCQ $0, R10 \
    ADCQ $0, R11 \
    \
    \ // T1 = a1 * b0, R11:R10:R9:R8 <- (24+a:16+a) * (8+b:0+b)
    \ // c0 (R11:R10) is stored to 0+c / 8+c just before each register is reused.
    MOVQ 0+b, DX \
    MULXQ 16+a, R8, R9 \
    MOVQ R10, 0+c \
    MULXQ 24+a, R10, AX \
    ADDQ R10, R9 \
    MOVQ 8+b, DX \
    MOVQ R11, 8+c \
    MULXQ 24+a, R10, R11 \
    ADCQ AX, R10 \
    ADCQ $0, R11 \
    MULXQ 16+a, DX, AX \
    ADDQ DX, R9 \
    ADCQ AX, R10 \
    ADCQ $0, R11 \
    \
    \ // c1 = T0 + T1 = a0*b1 + a1*b0
    ADDQ R12, R8 \
    ADCQ R13, R9 \
    ADCQ R14, R10 \
    ADCQ R15, R11 \
    \
    \ // Reducing and storing c1
    SHLQ $1, R10, R11 \
    SHLQ $1, R9, R10 \
    BTRQ $63, R9 \
    BTRQ $63, R11 \
    ADCQ R10, R8 \
    ADCQ R11, R9 \
    BTRQ $63, R9 \
    ADCQ $0, R8 \
    ADCQ $0, R9 \
    MOVQ R8, 16+c \
    MOVQ R9, 24+c

// _fqMulLeg: c = a * b without MULXQ, built on _fpMulLeg.
//   c0 = a0*b0 - a1*b1
//   c1 = a0*b1 + a1*b0
// _fpMulLeg is not defined here; from its use below it is assumed to leave
// the product of its last two operands in the three registers given first
// (high, middle, low) - confirm in fp_amd64.h.
// This macro itself writes R8-R15 and the flags; c0 is parked in R15:R14
// across the second pair of _fpMulLeg expansions, so _fpMulLeg must not
// touch R14/R15 (confirm in fp_amd64.h).
#define _fqMulLeg(c, a, b) \
    \ // R10:R9:R8 = a0*b0, R13:R12:R11 = a1*b1
    _fpMulLeg(R10, R9, R8, 0+a, 0+b) \
    _fpMulLeg(R13,R12,R11,16+a,16+b) \
    \ // c0 = a0*b0 - a1*b1; R14 becomes 0 or -1 depending on the final borrow
    MOVQ $0,R14 \
    SUBQ R11, R8 \
    SBBQ R12, R9 \
    SBBQ R13,R10 \
    SBBQ  $0,R14 \
    \ // Fold the part above bit 127 back in, then park c0 in R15:R14
    SHLQ  $1,R10 \
    BTRQ $63, R9 \
    ADCQ R10, R8 \
    ADCQ R14, R9 \
    MOVQ  R8, R14 \
    MOVQ  R9, R15 \
    \ // R10:R9:R8 = a0*b1, R13:R12:R11 = a1*b0
    _fpMulLeg(R10, R9, R8, 0+a,16+b) \
    _fpMulLeg(R13,R12,R11,16+a, 0+b) \
    \ // c1 = a0*b1 + a1*b0, folded the same way
    ADDQ R11, R8 \
    ADCQ R12, R9 \
    ADCQ R13,R10 \
    SHLQ  $1,R10 \
    BTRQ $63, R9 \
    ADCQ R10, R8 \
    ADCQ  $0, R9 \
    \ // Store c0 and c1
    MOVQ R14, 0+c \
    MOVQ R15, 8+c \
    MOVQ  R8,16+c \
    MOVQ  R9,24+c

// _fqSqrBmi2: c = a^2 using MULXQ (requires BMI2).
//   c0 = (a0 + a1) * (a0 - a1)
//   c1 = 2*a0 * a1
// c and a are memory operands to which the byte offsets 0/8/16/24 are
// prepended. Overwrites AX, CX, DX, R8-R14 and the flags.
#define _fqSqrBmi2(c,a) \
    \ // t1 = R11:R10 = a0 - a1, CX:R14 = a1
    MOVQ 0+a, R10 \
    MOVQ 16+a, R14 \
    SUBQ R14, R10 \
    MOVQ 8+a, R11 \
    MOVQ 24+a, CX \
    SBBQ CX, R11 \
    \
    \ // If the subtraction wrapped, bit 127 is set: clear it and subtract 1.
    \ // Both SBBQ must run back to back - the ADDQ/ADCQ that follow
    \ // overwrite CF, so a borrow out of R10 would otherwise never reach R11
    \ // (e.g. a0 = 0, a1 = 2^64).
    BTRQ $63, R11 \
    SBBQ $0, R10 \
    SBBQ $0, R11 \
    \
    \ // t0 = R9:R8 = a0 + a1
    MOVQ R10, DX \
    MOVQ 0+a, R8 \
    ADDQ R14, R8 \
    MOVQ 8+a, R9 \
    ADCQ CX, R9 \
    \
    \ // c0 = t0 * t1 = (a0 + a1)*(a0 - a1), CX:R14:R13:R12 <- R9:R8 * R11:R10
    \ // R9 and R8 are reloaded with a0 as soon as their last product is issued.
    MULXQ R8, R12, R13 \
    MULXQ R9, R14, AX \
    MOVQ R11, DX \
    ADDQ R14, R13 \
    MULXQ R9, R14, CX \
    MOVQ 8+a, R9 \
    ADCQ AX, R14 \
    ADCQ $0, CX \
    MULXQ R8, DX, AX \
    MOVQ 0+a, R8 \
    ADDQ DX, R13 \
    ADCQ AX, R14 \
    ADCQ $0, CX \
    \
    \ // t2 = R9:R8 = 2*a0
    ADDQ R8, R8 \
    ADCQ R9, R9 \
    \
    \ // Reducing and storing c0
    SHLQ $1, R14, CX \
    SHLQ $1, R13, R14 \
    BTRQ $63, R13 \
    BTRQ $63, CX \
    ADCQ R14, R12 \
    ADCQ CX, R13 \
    BTRQ $63, R13 \
    ADCQ $0, R12 \
    ADCQ $0, R13 \
    MOVQ R12, 0+c \
    MOVQ R13, 8+c \
    \
    \ // c1 = 2a0 * a1, CX:R14:R11:R10 <- R9:R8 * (24+a:16+a)
    MOVQ 16+a, DX \
    MULXQ R8, R10, R11 \
    MULXQ R9, R14, AX \
    ADDQ R14, R11 \
    MOVQ 24+a, DX \
    MULXQ R9, R14, CX \
    ADCQ AX, R14 \
    ADCQ $0, CX \
    MULXQ R8, DX, AX \
    ADDQ DX, R11 \
    ADCQ AX, R14 \
    ADCQ $0, CX \
    \
    \ // Reduce and store c1
    SHLQ $1, R14, CX \
    SHLQ $1, R11, R14 \
    BTRQ $63, R11 \
    BTRQ $63, CX \
    ADCQ R14, R10 \
    ADCQ CX, R11 \
    BTRQ $63, R11 \
    ADCQ $0, R10 \
    ADCQ $0, R11 \
    MOVQ R10, 16+c \
    MOVQ R11, 24+c

// _fqSqrLeg: c = a^2 without MULXQ; simply multiplies a by itself.
#define _fqSqrLeg(c,a) _fqMulLeg(c,a,a)