// +build !purego

// Copyright 2020 ConsenSys Software Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "textflag.h"
#include "funcdata.h"

// modulus q
// Four 64-bit limbs in a 32-byte read-only symbol. Offset 0 is the least
// significant limb: REDUCE below starts its borrow chain (SUBQ) at q<>+0
// and continues with SBBQ at +8, +16, +24.
DATA q<>+0(SB)/8, $0xffffffff00000001
DATA q<>+8(SB)/8, $0x53bda402fffe5bfe
DATA q<>+16(SB)/8, $0x3339d80809a1d805
DATA q<>+24(SB)/8, $0x73eda753299d7d48
GLOBL q<>(SB), (RODATA+NOPTR), $32

// qInv0 q'[0]
// 64-bit constant with q[0] * qInv0 == -1 (mod 2^64); it is used to derive
// the per-round multiplier m := t[0]*q'[0] mod W.
DATA qInv0<>(SB)/8, $0xfffffffeffffffff
GLOBL qInv0<>(SB), (RODATA+NOPTR), $8

// REDUCE(ra0..ra3, rb0..rb3) performs one conditional subtraction of q.
//   ra0..ra3: 4-limb value (ra0 least significant), updated in place
//   rb0..rb3: scratch registers, overwritten with the original ra limbs
// Each ra limb is saved into rb, then q is subtracted with borrow. If the
// final SBBQ leaves the carry flag set (the value was smaller than q), the
// CMOVQCS instructions restore the saved limbs; otherwise ra keeps ra - q.
// The macro's last line ends with a continuation backslash, so the blank
// line that follows the definition is what terminates it.
#define REDUCE(ra0, ra1, ra2, ra3, rb0, rb1, rb2, rb3) \
	MOVQ    ra0, rb0;        \
	SUBQ    q<>(SB), ra0;    \
	MOVQ    ra1, rb1;        \
	SBBQ    q<>+8(SB), ra1;  \
	MOVQ    ra2, rb2;        \
	SBBQ    q<>+16(SB), ra2; \
	MOVQ    ra3, rb3;        \
	SBBQ    q<>+24(SB), ra3; \
	CMOVQCS rb0, ra0;        \
	CMOVQCS rb1, ra1;        \
	CMOVQCS rb2, ra2;        \
	CMOVQCS rb3, ra3;        \

// mul(res, x, y *Element)
//
// Arguments (24 bytes): res+0(FP), x+8(FP), y+16(FP), all pointers.
// Frame (24 bytes): outgoing argument area used only by the fallback at l1.
//
// When ·supportAdx is 1 the MULX/ADCX/ADOX path below runs: four unrolled
// rounds (one per limb of y), each a multiply-accumulate pass followed by a
// reduction pass, then a single REDUCE and a store of the four limbs to res.
// Otherwise the three pointers are forwarded unchanged to ·_mulGeneric.
TEXT ·mul(SB), $24-24

	// the algorithm is described in the Element.Mul declaration (.go)
	// however, to benefit from the ADCX and ADOX carry chains
	// we split the inner loops in 2:
	// for i=0 to N-1
	//     for j=0 to N-1
	//         (A,t[j])  := t[j] + x[j]*y[i] + A
	//     m := t[0]*q'[0] mod W
	//     C,_ := t[0] + m*q[0]
	//     for j=1 to N-1
	//         (C,t[j-1]) := t[j] + m*q[j] + C
	//     t[N-1] = C + A

	NO_LOCAL_POINTERS
	CMPB ·supportAdx(SB), $1
	JNE  l1
	MOVQ x+8(FP), SI

	// x[0] -> DI
	// x[1] -> R8
	// x[2] -> R9
	// x[3] -> R10
	MOVQ 0(SI), DI
	MOVQ 8(SI), R8
	MOVQ 16(SI), R9
	MOVQ 24(SI), R10
	MOVQ y+16(FP), R11

	// ---- round 0: y[0] ----

	// A -> BP
	// t[0] -> R14
	// t[1] -> R13
	// t[2] -> CX
	// t[3] -> BX
	// clear the flags
	XORQ AX, AX
	MOVQ 0(R11), DX

	// MULXQ multiplies its first operand by DX (here y[0]) and writes the
	// low half to the second operand and the high half to the third.

	// (A,t[0])  := x[0]*y[0] + A
	MULXQ DI, R14, R13

	// (A,t[1])  := x[1]*y[0] + A
	MULXQ R8, AX, CX
	ADOXQ AX, R13

	// (A,t[2])  := x[2]*y[0] + A
	MULXQ R9, AX, BX
	ADOXQ AX, CX

	// (A,t[3])  := x[3]*y[0] + A
	MULXQ R10, AX, BP
	ADOXQ AX, BX

	// A += carry from the ADOXQ chain
	// (t starts at zero in this round, so no ADCXQ chain was needed)
	MOVQ  $0, AX
	ADOXQ AX, BP

	// m := t[0]*q'[0] mod W
	MOVQ  qInv0<>(SB), DX
	IMULQ R14, DX

	// clear the flags
	XORQ AX, AX

	// C,_ := t[0] + m*q[0]
	// (the low word lands in AX and is discarded; only the carry is kept)
	MULXQ q<>+0(SB), AX, R12
	ADCXQ R14, AX
	MOVQ  R12, R14

	// (C,t[0]) := t[1] + m*q[1] + C
	ADCXQ R13, R14
	MULXQ q<>+8(SB), AX, R13
	ADOXQ AX, R14

	// (C,t[1]) := t[2] + m*q[2] + C
	ADCXQ CX, R13
	MULXQ q<>+16(SB), AX, CX
	ADOXQ AX, R13

	// (C,t[2]) := t[3] + m*q[3] + C
	ADCXQ BX, CX
	MULXQ q<>+24(SB), AX, BX
	ADOXQ AX, CX

	// t[3] = C + A
	MOVQ  $0, AX
	ADCXQ AX, BX
	ADOXQ BP, BX

	// ---- round 1: y[1] ----

	// clear the flags
	XORQ AX, AX
	MOVQ 8(R11), DX

	// (A,t[0])  := t[0] + x[0]*y[1] + A
	MULXQ DI, AX, BP
	ADOXQ AX, R14

	// (A,t[1])  := t[1] + x[1]*y[1] + A
	ADCXQ BP, R13
	MULXQ R8, AX, BP
	ADOXQ AX, R13

	// (A,t[2])  := t[2] + x[2]*y[1] + A
	ADCXQ BP, CX
	MULXQ R9, AX, BP
	ADOXQ AX, CX

	// (A,t[3])  := t[3] + x[3]*y[1] + A
	ADCXQ BP, BX
	MULXQ R10, AX, BP
	ADOXQ AX, BX

	// A += carries from ADCXQ and ADOXQ
	MOVQ  $0, AX
	ADCXQ AX, BP
	ADOXQ AX, BP

	// m := t[0]*q'[0] mod W
	MOVQ  qInv0<>(SB), DX
	IMULQ R14, DX

	// clear the flags
	XORQ AX, AX

	// C,_ := t[0] + m*q[0]
	MULXQ q<>+0(SB), AX, R12
	ADCXQ R14, AX
	MOVQ  R12, R14

	// (C,t[0]) := t[1] + m*q[1] + C
	ADCXQ R13, R14
	MULXQ q<>+8(SB), AX, R13
	ADOXQ AX, R14

	// (C,t[1]) := t[2] + m*q[2] + C
	ADCXQ CX, R13
	MULXQ q<>+16(SB), AX, CX
	ADOXQ AX, R13

	// (C,t[2]) := t[3] + m*q[3] + C
	ADCXQ BX, CX
	MULXQ q<>+24(SB), AX, BX
	ADOXQ AX, CX

	// t[3] = C + A
	MOVQ  $0, AX
	ADCXQ AX, BX
	ADOXQ BP, BX

	// ---- round 2: y[2] ----

	// clear the flags
	XORQ AX, AX
	MOVQ 16(R11), DX

	// (A,t[0])  := t[0] + x[0]*y[2] + A
	MULXQ DI, AX, BP
	ADOXQ AX, R14

	// (A,t[1])  := t[1] + x[1]*y[2] + A
	ADCXQ BP, R13
	MULXQ R8, AX, BP
	ADOXQ AX, R13

	// (A,t[2])  := t[2] + x[2]*y[2] + A
	ADCXQ BP, CX
	MULXQ R9, AX, BP
	ADOXQ AX, CX

	// (A,t[3])  := t[3] + x[3]*y[2] + A
	ADCXQ BP, BX
	MULXQ R10, AX, BP
	ADOXQ AX, BX

	// A += carries from ADCXQ and ADOXQ
	MOVQ  $0, AX
	ADCXQ AX, BP
	ADOXQ AX, BP

	// m := t[0]*q'[0] mod W
	MOVQ  qInv0<>(SB), DX
	IMULQ R14, DX

	// clear the flags
	XORQ AX, AX

	// C,_ := t[0] + m*q[0]
	MULXQ q<>+0(SB), AX, R12
	ADCXQ R14, AX
	MOVQ  R12, R14

	// (C,t[0]) := t[1] + m*q[1] + C
	ADCXQ R13, R14
	MULXQ q<>+8(SB), AX, R13
	ADOXQ AX, R14

	// (C,t[1]) := t[2] + m*q[2] + C
	ADCXQ CX, R13
	MULXQ q<>+16(SB), AX, CX
	ADOXQ AX, R13

	// (C,t[2]) := t[3] + m*q[3] + C
	ADCXQ BX, CX
	MULXQ q<>+24(SB), AX, BX
	ADOXQ AX, CX

	// t[3] = C + A
	MOVQ  $0, AX
	ADCXQ AX, BX
	ADOXQ BP, BX

	// ---- round 3: y[3] ----

	// clear the flags
	XORQ AX, AX
	MOVQ 24(R11), DX

	// (A,t[0])  := t[0] + x[0]*y[3] + A
	MULXQ DI, AX, BP
	ADOXQ AX, R14

	// (A,t[1])  := t[1] + x[1]*y[3] + A
	ADCXQ BP, R13
	MULXQ R8, AX, BP
	ADOXQ AX, R13

	// (A,t[2])  := t[2] + x[2]*y[3] + A
	ADCXQ BP, CX
	MULXQ R9, AX, BP
	ADOXQ AX, CX

	// (A,t[3])  := t[3] + x[3]*y[3] + A
	ADCXQ BP, BX
	MULXQ R10, AX, BP
	ADOXQ AX, BX

	// A += carries from ADCXQ and ADOXQ
	MOVQ  $0, AX
	ADCXQ AX, BP
	ADOXQ AX, BP

	// m := t[0]*q'[0] mod W
	MOVQ  qInv0<>(SB), DX
	IMULQ R14, DX

	// clear the flags
	XORQ AX, AX

	// C,_ := t[0] + m*q[0]
	MULXQ q<>+0(SB), AX, R12
	ADCXQ R14, AX
	MOVQ  R12, R14

	// (C,t[0]) := t[1] + m*q[1] + C
	ADCXQ R13, R14
	MULXQ q<>+8(SB), AX, R13
	ADOXQ AX, R14

	// (C,t[1]) := t[2] + m*q[2] + C
	ADCXQ CX, R13
	MULXQ q<>+16(SB), AX, CX
	ADOXQ AX, R13

	// (C,t[2]) := t[3] + m*q[3] + C
	ADCXQ BX, CX
	MULXQ q<>+24(SB), AX, BX
	ADOXQ AX, CX

	// t[3] = C + A
	MOVQ  $0, AX
	ADCXQ AX, BX
	ADOXQ BP, BX

	// reduce element(R14,R13,CX,BX) using temp registers (SI,R12,R11,DI)
	// SI (x pointer), R11 (y pointer) and DI (x[0]) are not read again
	// after round 3, so they can serve as scratch here.
	REDUCE(R14,R13,CX,BX,SI,R12,R11,DI)

	// store the four result limbs through res
	MOVQ res+0(FP), AX
	MOVQ R14, 0(AX)
	MOVQ R13, 8(AX)
	MOVQ CX, 16(AX)
	MOVQ BX, 24(AX)
	RET

l1:
	// fallback taken when ·supportAdx != 1: copy (res, x, y) into the
	// outgoing argument slots at 0/8/16(SP) and call the generic routine
	MOVQ res+0(FP), AX
	MOVQ AX, (SP)
	MOVQ x+8(FP), AX
	MOVQ AX, 8(SP)
	MOVQ y+16(FP), AX
	MOVQ AX, 16(SP)
	CALL ·_mulGeneric(SB)
	RET

// fromMont(res)
//
// Argument (8 bytes): res+0(FP), a pointer to four 64-bit limbs that are
// read, transformed and written back in place.
// Frame (8 bytes): outgoing argument slot used only by the fallback at l2.
//
// When ·supportAdx is 1, four unrolled reduction rounds are applied to the
// limbs, followed by one REDUCE. Otherwise res is forwarded unchanged to
// ·_fromMontGeneric.
TEXT ·fromMont(SB), $8-8
	NO_LOCAL_POINTERS

	// the algorithm is described here
	// https://hackmd.io/@gnark/modular_multiplication
	// when y = 1 we have:
	// for i=0 to N-1
	//     t[i] = x[i]
	// for i=0 to N-1
	//     m := t[0]*q'[0] mod W
	//     C,_ := t[0] + m*q[0]
	//     for j=1 to N-1
	//         (C,t[j-1]) := t[j] + m*q[j] + C
	//     t[N-1] = C
	CMPB ·supportAdx(SB), $1
	JNE  l2

	// t[0] -> R14
	// t[1] -> R13
	// t[2] -> CX
	// t[3] -> BX
	MOVQ res+0(FP), DX
	MOVQ 0(DX), R14
	MOVQ 8(DX), R13
	MOVQ 16(DX), CX
	MOVQ 24(DX), BX

	// ---- round 0 ----
	// NOTE: DX is overwritten by the very next MOVQ, and the flags are
	// cleared again by XORQ AX, AX after IMULQ; this XORQ is kept as in
	// the original instruction stream.
	XORQ DX, DX

	// m := t[0]*q'[0] mod W
	MOVQ  qInv0<>(SB), DX
	IMULQ R14, DX
	XORQ  AX, AX

	// C,_ := t[0] + m*q[0]
	MULXQ q<>+0(SB), AX, BP
	ADCXQ R14, AX
	MOVQ  BP, R14

	// (C,t[0]) := t[1] + m*q[1] + C
	ADCXQ R13, R14
	MULXQ q<>+8(SB), AX, R13
	ADOXQ AX, R14

	// (C,t[1]) := t[2] + m*q[2] + C
	ADCXQ CX, R13
	MULXQ q<>+16(SB), AX, CX
	ADOXQ AX, R13

	// (C,t[2]) := t[3] + m*q[3] + C
	ADCXQ BX, CX
	MULXQ q<>+24(SB), AX, BX
	ADOXQ AX, CX

	// t[3] = C (fold the pending carries of both chains into BX)
	MOVQ  $0, AX
	ADCXQ AX, BX
	ADOXQ AX, BX

	// ---- round 1 ----
	XORQ DX, DX

	// m := t[0]*q'[0] mod W
	MOVQ  qInv0<>(SB), DX
	IMULQ R14, DX
	XORQ  AX, AX

	// C,_ := t[0] + m*q[0]
	MULXQ q<>+0(SB), AX, BP
	ADCXQ R14, AX
	MOVQ  BP, R14

	// (C,t[0]) := t[1] + m*q[1] + C
	ADCXQ R13, R14
	MULXQ q<>+8(SB), AX, R13
	ADOXQ AX, R14

	// (C,t[1]) := t[2] + m*q[2] + C
	ADCXQ CX, R13
	MULXQ q<>+16(SB), AX, CX
	ADOXQ AX, R13

	// (C,t[2]) := t[3] + m*q[3] + C
	ADCXQ BX, CX
	MULXQ q<>+24(SB), AX, BX
	ADOXQ AX, CX

	// t[3] = C
	MOVQ  $0, AX
	ADCXQ AX, BX
	ADOXQ AX, BX

	// ---- round 2 ----
	XORQ DX, DX

	// m := t[0]*q'[0] mod W
	MOVQ  qInv0<>(SB), DX
	IMULQ R14, DX
	XORQ  AX, AX

	// C,_ := t[0] + m*q[0]
	MULXQ q<>+0(SB), AX, BP
	ADCXQ R14, AX
	MOVQ  BP, R14

	// (C,t[0]) := t[1] + m*q[1] + C
	ADCXQ R13, R14
	MULXQ q<>+8(SB), AX, R13
	ADOXQ AX, R14

	// (C,t[1]) := t[2] + m*q[2] + C
	ADCXQ CX, R13
	MULXQ q<>+16(SB), AX, CX
	ADOXQ AX, R13

	// (C,t[2]) := t[3] + m*q[3] + C
	ADCXQ BX, CX
	MULXQ q<>+24(SB), AX, BX
	ADOXQ AX, CX

	// t[3] = C
	MOVQ  $0, AX
	ADCXQ AX, BX
	ADOXQ AX, BX

	// ---- round 3 ----
	XORQ DX, DX

	// m := t[0]*q'[0] mod W
	MOVQ  qInv0<>(SB), DX
	IMULQ R14, DX
	XORQ  AX, AX

	// C,_ := t[0] + m*q[0]
	MULXQ q<>+0(SB), AX, BP
	ADCXQ R14, AX
	MOVQ  BP, R14

	// (C,t[0]) := t[1] + m*q[1] + C
	ADCXQ R13, R14
	MULXQ q<>+8(SB), AX, R13
	ADOXQ AX, R14

	// (C,t[1]) := t[2] + m*q[2] + C
	ADCXQ CX, R13
	MULXQ q<>+16(SB), AX, CX
	ADOXQ AX, R13

	// (C,t[2]) := t[3] + m*q[3] + C
	ADCXQ BX, CX
	MULXQ q<>+24(SB), AX, BX
	ADOXQ AX, CX

	// t[3] = C
	MOVQ  $0, AX
	ADCXQ AX, BX
	ADOXQ AX, BX

	// reduce element(R14,R13,CX,BX) using temp registers (SI,DI,R8,R9)
	REDUCE(R14,R13,CX,BX,SI,DI,R8,R9)

	// write the four limbs back through res
	MOVQ res+0(FP), AX
	MOVQ R14, 0(AX)
	MOVQ R13, 8(AX)
	MOVQ CX, 16(AX)
	MOVQ BX, 24(AX)
	RET

l2:
	// fallback taken when ·supportAdx != 1: pass res in the outgoing
	// argument slot at (SP) and call the generic routine
	MOVQ res+0(FP), AX
	MOVQ AX, (SP)
	CALL ·_fromMontGeneric(SB)
	RET