github.com/gnolang/gno@v0.0.0-20240520182011-228e9d0192ce/examples/gno.land/p/demo/uint256/mod.gno

package uint256

import (
	"math/bits"
)

// Some utility functions

// Reciprocal computes a 320-bit value representing 1/m
//
// Notes:
// - specialized for m.arr[3] != 0, hence limited to 2^192 <= m < 2^256
// - returns zero if m.arr[3] == 0
// - starts with a 32-bit division, refines with newton-raphson iterations
func Reciprocal(m *Uint) (mu [5]uint64) {
	if m.arr[3] == 0 {
		return mu
	}

	s := bits.LeadingZeros64(m.arr[3]) // Replace with leadingZeros(m) for general case
	p := 255 - s                       // floor(log_2(m)), m>0

	// 0 or a power of 2?

	// Check if at least one bit is set in m.arr[2], m.arr[1] or m.arr[0],
	// or at least two bits in m.arr[3]

	if m.arr[0]|m.arr[1]|m.arr[2]|(m.arr[3]&(m.arr[3]-1)) == 0 {

		mu[4] = ^uint64(0) >> uint(p&63)
		mu[3] = ^uint64(0)
		mu[2] = ^uint64(0)
		mu[1] = ^uint64(0)
		mu[0] = ^uint64(0)

		return mu
	}

	// Maximise division precision by left-aligning divisor

	var (
		y  Uint   // left-aligned copy of m
		r0 uint32 // estimate of 2^31/y
	)

	y.Lsh(m, uint(s)) // 1/2 < y < 1

	// Extract most significant 32 bits

	yh := uint32(y.arr[3] >> 32)

	if yh == 0x80000000 { // Avoid overflow in division
		r0 = 0xffffffff
	} else {
		r0, _ = bits.Div32(0x80000000, 0, yh)
	}

	// First iteration: 32 -> 64

	t1 := uint64(r0)                 // 2^31/y
	t1 *= t1                         // 2^62/y^2
	t1, _ = bits.Mul64(t1, y.arr[3]) // 2^62/y^2 * 2^64/y / 2^64 = 2^62/y

	r1 := uint64(r0) << 32 // 2^63/y
	r1 -= t1               // 2^63/y - 2^62/y = 2^62/y
	r1 *= 2                // 2^63/y

	if (r1 | (y.arr[3] << 1)) == 0 {
		r1 = ^uint64(0)
	}

	// Second iteration: 64 -> 128

	// square: 2^126/y^2
	a2h, a2l := bits.Mul64(r1, r1)

	// multiply by y: e2h:e2l:b2h = 2^126/y^2 * 2^128/y / 2^128 = 2^126/y
	b2h, _ := bits.Mul64(a2l, y.arr[2])
	c2h, c2l := bits.Mul64(a2l, y.arr[3])
	d2h, d2l := bits.Mul64(a2h, y.arr[2])
	e2h, e2l := bits.Mul64(a2h, y.arr[3])

	b2h, c := bits.Add64(b2h, c2l, 0)
	e2l, c = bits.Add64(e2l, c2h, c)
	e2h, _ = bits.Add64(e2h, 0, c)

	_, c = bits.Add64(b2h, d2l, 0)
	e2l, c = bits.Add64(e2l, d2h, c)
	e2h, _ = bits.Add64(e2h, 0, c)

	// subtract: t2h:t2l = 2^127/y - 2^126/y = 2^126/y
	t2l, b := bits.Sub64(0, e2l, 0)
	t2h, _ := bits.Sub64(r1, e2h, b)

	// double: r2h:r2l = 2^127/y
	r2l, c := bits.Add64(t2l, t2l, 0)
	r2h, _ := bits.Add64(t2h, t2h, c)

	if (r2h | r2l | (y.arr[3] << 1)) == 0 {
		r2h = ^uint64(0)
		r2l = ^uint64(0)
	}

	// Third iteration: 128 -> 192

	// square r2 (keep 256 bits): 2^190/y^2
	a3h, a3l := bits.Mul64(r2l, r2l)
	b3h, b3l := bits.Mul64(r2l, r2h)
	c3h, c3l := bits.Mul64(r2h, r2h)

	a3h, c = bits.Add64(a3h, b3l, 0)
	c3l, c = bits.Add64(c3l, b3h, c)
	c3h, _ = bits.Add64(c3h, 0, c)

	a3h, c = bits.Add64(a3h, b3l, 0)
	c3l, c = bits.Add64(c3l, b3h, c)
	c3h, _ = bits.Add64(c3h, 0, c)

	// multiply by y: q = 2^190/y^2 * 2^192/y / 2^192 = 2^190/y

	x0 := a3l
	x1 := a3h
	x2 := c3l
	x3 := c3h

	var q0, q1, q2, q3, q4, t0 uint64

	q0, _ = bits.Mul64(x2, y.arr[0])
	q1, t0 = bits.Mul64(x3, y.arr[0])
	q0, c = bits.Add64(q0, t0, 0)
	q1, _ = bits.Add64(q1, 0, c)

	t1, _ = bits.Mul64(x1, y.arr[1])
	q0, c = bits.Add64(q0, t1, 0)
	q2, t0 = bits.Mul64(x3, y.arr[1])
	q1, c = bits.Add64(q1, t0, c)
	q2, _ = bits.Add64(q2, 0, c)

	t1, t0 = bits.Mul64(x2, y.arr[1])
	q0, c = bits.Add64(q0, t0, 0)
	q1, c = bits.Add64(q1, t1, c)
	q2, _ = bits.Add64(q2, 0, c)

	t1, t0 = bits.Mul64(x1, y.arr[2])
	q0, c = bits.Add64(q0, t0, 0)
	q1, c = bits.Add64(q1, t1, c)
	q3, t0 = bits.Mul64(x3, y.arr[2])
	q2, c = bits.Add64(q2, t0, c)
	q3, _ = bits.Add64(q3, 0, c)

	t1, _ = bits.Mul64(x0, y.arr[2])
	q0, c = bits.Add64(q0, t1, 0)
	t1, t0 = bits.Mul64(x2, y.arr[2])
	q1, c = bits.Add64(q1, t0, c)
	q2, c = bits.Add64(q2, t1, c)
	q3, _ = bits.Add64(q3, 0, c)

	t1, t0 = bits.Mul64(x1, y.arr[3])
	q1, c = bits.Add64(q1, t0, 0)
	q2, c = bits.Add64(q2, t1, c)
	q4, t0 = bits.Mul64(x3, y.arr[3])
	q3, c = bits.Add64(q3, t0, c)
	q4, _ = bits.Add64(q4, 0, c)

	t1, t0 = bits.Mul64(x0, y.arr[3])
	q0, c = bits.Add64(q0, t0, 0)
	q1, c = bits.Add64(q1, t1, c)
	t1, t0 = bits.Mul64(x2, y.arr[3])
	q2, c = bits.Add64(q2, t0, c)
	q3, c = bits.Add64(q3, t1, c)
	q4, _ = bits.Add64(q4, 0, c)

	// subtract: t3 = 2^191/y - 2^190/y = 2^190/y
	_, b = bits.Sub64(0, q0, 0)
	_, b = bits.Sub64(0, q1, b)
	t3l, b := bits.Sub64(0, q2, b)
	t3m, b := bits.Sub64(r2l, q3, b)
	t3h, _ := bits.Sub64(r2h, q4, b)

	// double: r3 = 2^191/y
	r3l, c := bits.Add64(t3l, t3l, 0)
	r3m, c := bits.Add64(t3m, t3m, c)
	r3h, _ := bits.Add64(t3h, t3h, c)

	// Fourth iteration: 192 -> 320

	// square r3

	a4h, a4l := bits.Mul64(r3l, r3l)
	b4h, b4l := bits.Mul64(r3l, r3m)
	c4h, c4l := bits.Mul64(r3l, r3h)
	d4h, d4l := bits.Mul64(r3m, r3m)
	e4h, e4l := bits.Mul64(r3m, r3h)
	f4h, f4l := bits.Mul64(r3h, r3h)

	b4h, c = bits.Add64(b4h, c4l, 0)
	e4l, c = bits.Add64(e4l, c4h, c)
	e4h, _ = bits.Add64(e4h, 0, c)

	a4h, c = bits.Add64(a4h, b4l, 0)
	d4l, c = bits.Add64(d4l, b4h, c)
	d4h, c = bits.Add64(d4h, e4l, c)
	f4l, c = bits.Add64(f4l, e4h, c)
	f4h, _ = bits.Add64(f4h, 0, c)

	a4h, c = bits.Add64(a4h, b4l, 0)
	d4l, c = bits.Add64(d4l, b4h, c)
	d4h, c = bits.Add64(d4h, e4l, c)
	f4l, c = bits.Add64(f4l, e4h, c)
	f4h, _ = bits.Add64(f4h, 0, c)

	// multiply by y

	x1, x0 = bits.Mul64(d4h, y.arr[0])
	x3, x2 = bits.Mul64(f4h, y.arr[0])
	t1, t0 = bits.Mul64(f4l, y.arr[0])
	x1, c = bits.Add64(x1, t0, 0)
	x2, c = bits.Add64(x2, t1, c)
	x3, _ = bits.Add64(x3, 0, c)

	t1, t0 = bits.Mul64(d4h, y.arr[1])
	x1, c = bits.Add64(x1, t0, 0)
	x2, c = bits.Add64(x2, t1, c)
	x4, t0 := bits.Mul64(f4h, y.arr[1])
	x3, c = bits.Add64(x3, t0, c)
	x4, _ = bits.Add64(x4, 0, c)
	t1, t0 = bits.Mul64(d4l, y.arr[1])
	x0, c = bits.Add64(x0, t0, 0)
	x1, c = bits.Add64(x1, t1, c)
	t1, t0 = bits.Mul64(f4l, y.arr[1])
	x2, c = bits.Add64(x2, t0, c)
	x3, c = bits.Add64(x3, t1, c)
	x4, _ = bits.Add64(x4, 0, c)

	t1, t0 = bits.Mul64(a4h, y.arr[2])
	x0, c = bits.Add64(x0, t0, 0)
	x1, c = bits.Add64(x1, t1, c)
	t1, t0 = bits.Mul64(d4h, y.arr[2])
	x2, c = bits.Add64(x2, t0, c)
	x3, c = bits.Add64(x3, t1, c)
	x5, t0 := bits.Mul64(f4h, y.arr[2])
	x4, c = bits.Add64(x4, t0, c)
	x5, _ = bits.Add64(x5, 0, c)
	t1, t0 = bits.Mul64(d4l, y.arr[2])
	x1, c = bits.Add64(x1, t0, 0)
	x2, c = bits.Add64(x2, t1, c)
	t1, t0 = bits.Mul64(f4l, y.arr[2])
	x3, c = bits.Add64(x3, t0, c)
	x4, c = bits.Add64(x4, t1, c)
	x5, _ = bits.Add64(x5, 0, c)

	t1, t0 = bits.Mul64(a4h, y.arr[3])
	x1, c = bits.Add64(x1, t0, 0)
	x2, c = bits.Add64(x2, t1, c)
	t1, t0 = bits.Mul64(d4h, y.arr[3])
	x3, c = bits.Add64(x3, t0, c)
	x4, c = bits.Add64(x4, t1, c)
	x6, t0 := bits.Mul64(f4h, y.arr[3])
	x5, c = bits.Add64(x5, t0, c)
	x6, _ = bits.Add64(x6, 0, c)
	t1, t0 = bits.Mul64(a4l, y.arr[3])
	x0, c = bits.Add64(x0, t0, 0)
	x1, c = bits.Add64(x1, t1, c)
	t1, t0 = bits.Mul64(d4l, y.arr[3])
	x2, c = bits.Add64(x2, t0, c)
	x3, c = bits.Add64(x3, t1, c)
	t1, t0 = bits.Mul64(f4l, y.arr[3])
	x4, c = bits.Add64(x4, t0, c)
	x5, c = bits.Add64(x5, t1, c)
	x6, _ = bits.Add64(x6, 0, c)

	// subtract
	_, b = bits.Sub64(0, x0, 0)
	_, b = bits.Sub64(0, x1, b)
	r4l, b := bits.Sub64(0, x2, b)
	r4k, b := bits.Sub64(0, x3, b)
	r4j, b := bits.Sub64(r3l, x4, b)
	r4i, b := bits.Sub64(r3m, x5, b)
	r4h, _ := bits.Sub64(r3h, x6, b)

	// Multiply candidate for 1/4y by y, with full precision

	x0 = r4l
	x1 = r4k
	x2 = r4j
	x3 = r4i
	x4 = r4h

	q1, q0 = bits.Mul64(x0, y.arr[0])
	q3, q2 = bits.Mul64(x2, y.arr[0])
	q5, q4 := bits.Mul64(x4, y.arr[0])

	t1, t0 = bits.Mul64(x1, y.arr[0])
	q1, c = bits.Add64(q1, t0, 0)
	q2, c = bits.Add64(q2, t1, c)
	t1, t0 = bits.Mul64(x3, y.arr[0])
	q3, c = bits.Add64(q3, t0, c)
	q4, c = bits.Add64(q4, t1, c)
	q5, _ = bits.Add64(q5, 0, c)

	t1, t0 = bits.Mul64(x0, y.arr[1])
	q1, c = bits.Add64(q1, t0, 0)
	q2, c = bits.Add64(q2, t1, c)
	t1, t0 = bits.Mul64(x2, y.arr[1])
	q3, c = bits.Add64(q3, t0, c)
	q4, c = bits.Add64(q4, t1, c)
	q6, t0 := bits.Mul64(x4, y.arr[1])
	q5, c = bits.Add64(q5, t0, c)
	q6, _ = bits.Add64(q6, 0, c)

	t1, t0 = bits.Mul64(x1, y.arr[1])
	q2, c = bits.Add64(q2, t0, 0)
	q3, c = bits.Add64(q3, t1, c)
	t1, t0 = bits.Mul64(x3, y.arr[1])
	q4, c = bits.Add64(q4, t0, c)
	q5, c = bits.Add64(q5, t1, c)
	q6, _ = bits.Add64(q6, 0, c)

	t1, t0 = bits.Mul64(x0, y.arr[2])
	q2, c = bits.Add64(q2, t0, 0)
	q3, c = bits.Add64(q3, t1, c)
	t1, t0 = bits.Mul64(x2, y.arr[2])
	q4, c = bits.Add64(q4, t0, c)
	q5, c = bits.Add64(q5, t1, c)
	q7, t0 := bits.Mul64(x4, y.arr[2])
	q6, c = bits.Add64(q6, t0, c)
	q7, _ = bits.Add64(q7, 0, c)

	t1, t0 = bits.Mul64(x1, y.arr[2])
	q3, c = bits.Add64(q3, t0, 0)
	q4, c = bits.Add64(q4, t1, c)
	t1, t0 = bits.Mul64(x3, y.arr[2])
	q5, c = bits.Add64(q5, t0, c)
	q6, c = bits.Add64(q6, t1, c)
	q7, _ = bits.Add64(q7, 0, c)

	t1, t0 = bits.Mul64(x0, y.arr[3])
	q3, c = bits.Add64(q3, t0, 0)
	q4, c = bits.Add64(q4, t1, c)
	t1, t0 = bits.Mul64(x2, y.arr[3])
	q5, c = bits.Add64(q5, t0, c)
	q6, c = bits.Add64(q6, t1, c)
	q8, t0 := bits.Mul64(x4, y.arr[3])
	q7, c = bits.Add64(q7, t0, c)
	q8, _ = bits.Add64(q8, 0, c)

	t1, t0 = bits.Mul64(x1, y.arr[3])
	q4, c = bits.Add64(q4, t0, 0)
	q5, c = bits.Add64(q5, t1, c)
	t1, t0 = bits.Mul64(x3, y.arr[3])
	q6, c = bits.Add64(q6, t0, c)
	q7, c = bits.Add64(q7, t1, c)
	q8, _ = bits.Add64(q8, 0, c)

	// Final adjustment

	// subtract q from 1/4
	_, b = bits.Sub64(0, q0, 0)
	_, b = bits.Sub64(0, q1, b)
	_, b = bits.Sub64(0, q2, b)
	_, b = bits.Sub64(0, q3, b)
	_, b = bits.Sub64(0, q4, b)
	_, b = bits.Sub64(0, q5, b)
	_, b = bits.Sub64(0, q6, b)
	_, b = bits.Sub64(0, q7, b)
	_, b = bits.Sub64(uint64(1)<<62, q8, b)

	// decrement the result
	x0, t := bits.Sub64(r4l, 1, 0)
	x1, t = bits.Sub64(r4k, 0, t)
	x2, t = bits.Sub64(r4j, 0, t)
	x3, t = bits.Sub64(r4i, 0, t)
	x4, _ = bits.Sub64(r4h, 0, t)

	// commit the decrement if the subtraction underflowed (reciprocal was too large)
	if b != 0 {
		r4h, r4i, r4j, r4k, r4l = x4, x3, x2, x1, x0
	}

	// Shift to correct bit alignment, truncating excess bits

	p = (p & 63) - 1

	x0, c = bits.Add64(r4l, r4l, 0)
	x1, c = bits.Add64(r4k, r4k, c)
	x2, c = bits.Add64(r4j, r4j, c)
	x3, c = bits.Add64(r4i, r4i, c)
	x4, _ = bits.Add64(r4h, r4h, c)

	if p < 0 {
		r4h, r4i, r4j, r4k, r4l = x4, x3, x2, x1, x0
		p = 0 // avoid negative shift below
	}

	{
		r := uint(p)      // right shift
		l := uint(64 - r) // left shift

		x0 = (r4l >> r) | (r4k << l)
		x1 = (r4k >> r) | (r4j << l)
		x2 = (r4j >> r) | (r4i << l)
		x3 = (r4i >> r) | (r4h << l)
		x4 = (r4h >> r)
	}

	if p > 0 {
		r4h, r4i, r4j, r4k, r4l = x4, x3, x2, x1, x0
	}

	mu[0] = r4l
	mu[1] = r4k
	mu[2] = r4j
	mu[3] = r4i
	mu[4] = r4h

	return mu
}

// reduce4 computes the least non-negative residue of x modulo m
//
// requires a four-word modulus (m.arr[3] != 0) and its inverse (mu)
func reduce4(x [8]uint64, m *Uint, mu [5]uint64) (z Uint) {
	// NB: Most variable names in the comments match the pseudocode for
	// Barrett reduction in the Handbook of Applied Cryptography.

	// q1 = x/2^192

	x0 := x[3]
	x1 := x[4]
	x2 := x[5]
	x3 := x[6]
	x4 := x[7]

	// q2 = q1 * mu; q3 = q2 / 2^320

	var q0, q1, q2, q3, q4, q5, t0, t1, c uint64

	q0, _ = bits.Mul64(x3, mu[0])
	q1, t0 = bits.Mul64(x4, mu[0])
	q0, c = bits.Add64(q0, t0, 0)
	q1, _ = bits.Add64(q1, 0, c)

	t1, _ = bits.Mul64(x2, mu[1])
	q0, c = bits.Add64(q0, t1, 0)
	q2, t0 = bits.Mul64(x4, mu[1])
	q1, c = bits.Add64(q1, t0, c)
	q2, _ = bits.Add64(q2, 0, c)

	t1, t0 = bits.Mul64(x3, mu[1])
	q0, c = bits.Add64(q0, t0, 0)
	q1, c = bits.Add64(q1, t1, c)
	q2, _ = bits.Add64(q2, 0, c)

	t1, t0 = bits.Mul64(x2, mu[2])
	q0, c = bits.Add64(q0, t0, 0)
	q1, c = bits.Add64(q1, t1, c)
	q3, t0 = bits.Mul64(x4, mu[2])
	q2, c = bits.Add64(q2, t0, c)
	q3, _ = bits.Add64(q3, 0, c)

	t1, _ = bits.Mul64(x1, mu[2])
	q0, c = bits.Add64(q0, t1, 0)
	t1, t0 = bits.Mul64(x3, mu[2])
	q1, c = bits.Add64(q1, t0, c)
	q2, c = bits.Add64(q2, t1, c)
	q3, _ = bits.Add64(q3, 0, c)

	t1, _ = bits.Mul64(x0, mu[3])
	q0, c = bits.Add64(q0, t1, 0)
	t1, t0 = bits.Mul64(x2, mu[3])
	q1, c = bits.Add64(q1, t0, c)
	q2, c = bits.Add64(q2, t1, c)
	q4, t0 = bits.Mul64(x4, mu[3])
	q3, c = bits.Add64(q3, t0, c)
	q4, _ = bits.Add64(q4, 0, c)

	t1, t0 = bits.Mul64(x1, mu[3])
	q0, c = bits.Add64(q0, t0, 0)
	q1, c = bits.Add64(q1, t1, c)
	t1, t0 = bits.Mul64(x3, mu[3])
	q2, c = bits.Add64(q2, t0, c)
	q3, c = bits.Add64(q3, t1, c)
	q4, _ = bits.Add64(q4, 0, c)

	t1, t0 = bits.Mul64(x0, mu[4])
	_, c = bits.Add64(q0, t0, 0)
	q1, c = bits.Add64(q1, t1, c)
	t1, t0 = bits.Mul64(x2, mu[4])
	q2, c = bits.Add64(q2, t0, c)
	q3, c = bits.Add64(q3, t1, c)
	q5, t0 = bits.Mul64(x4, mu[4])
	q4, c = bits.Add64(q4, t0, c)
	q5, _ = bits.Add64(q5, 0, c)

	t1, t0 = bits.Mul64(x1, mu[4])
	q1, c = bits.Add64(q1, t0, 0)
	q2, c = bits.Add64(q2, t1, c)
	t1, t0 = bits.Mul64(x3, mu[4])
	q3, c = bits.Add64(q3, t0, c)
	q4, c = bits.Add64(q4, t1, c)
	q5, _ = bits.Add64(q5, 0, c)

	// Drop the fractional part of q3

	q0 = q1
	q1 = q2
	q2 = q3
	q3 = q4
	q4 = q5

	// r1 = x mod 2^320

	x0 = x[0]
	x1 = x[1]
	x2 = x[2]
	x3 = x[3]
	x4 = x[4]

	// r2 = q3 * m mod 2^320

	var r0, r1, r2, r3, r4 uint64

	r4, r3 = bits.Mul64(q0, m.arr[3])
	_, t0 = bits.Mul64(q1, m.arr[3])
	r4, _ = bits.Add64(r4, t0, 0)

	t1, r2 = bits.Mul64(q0, m.arr[2])
	r3, c = bits.Add64(r3, t1, 0)
	_, t0 = bits.Mul64(q2, m.arr[2])
	r4, _ = bits.Add64(r4, t0, c)

	t1, t0 = bits.Mul64(q1, m.arr[2])
	r3, c = bits.Add64(r3, t0, 0)
	r4, _ = bits.Add64(r4, t1, c)

	t1, r1 = bits.Mul64(q0, m.arr[1])
	r2, c = bits.Add64(r2, t1, 0)
	t1, t0 = bits.Mul64(q2, m.arr[1])
	r3, c = bits.Add64(r3, t0, c)
	r4, _ = bits.Add64(r4, t1, c)

	t1, t0 = bits.Mul64(q1, m.arr[1])
	r2, c = bits.Add64(r2, t0, 0)
	r3, c = bits.Add64(r3, t1, c)
	_, t0 = bits.Mul64(q3, m.arr[1])
	r4, _ = bits.Add64(r4, t0, c)

	t1, r0 = bits.Mul64(q0, m.arr[0])
	r1, c = bits.Add64(r1, t1, 0)
	t1, t0 = bits.Mul64(q2, m.arr[0])
	r2, c = bits.Add64(r2, t0, c)
	r3, c = bits.Add64(r3, t1, c)
	_, t0 = bits.Mul64(q4, m.arr[0])
	r4, _ = bits.Add64(r4, t0, c)

	t1, t0 = bits.Mul64(q1, m.arr[0])
	r1, c = bits.Add64(r1, t0, 0)
	r2, c = bits.Add64(r2, t1, c)
	t1, t0 = bits.Mul64(q3, m.arr[0])
	r3, c = bits.Add64(r3, t0, c)
	r4, _ = bits.Add64(r4, t1, c)

	// r = r1 - r2

	var b uint64

	r0, b = bits.Sub64(x0, r0, 0)
	r1, b = bits.Sub64(x1, r1, b)
	r2, b = bits.Sub64(x2, r2, b)
	r3, b = bits.Sub64(x3, r3, b)
	r4, b = bits.Sub64(x4, r4, b)

	// if r<0 then r+=m

	if b != 0 {
		r0, c = bits.Add64(r0, m.arr[0], 0)
		r1, c = bits.Add64(r1, m.arr[1], c)
		r2, c = bits.Add64(r2, m.arr[2], c)
		r3, c = bits.Add64(r3, m.arr[3], c)
		r4, _ = bits.Add64(r4, 0, c)
	}

	// while (r>=m) r-=m

	for {
		// q = r - m
		q0, b = bits.Sub64(r0, m.arr[0], 0)
		q1, b = bits.Sub64(r1, m.arr[1], b)
		q2, b = bits.Sub64(r2, m.arr[2], b)
		q3, b = bits.Sub64(r3, m.arr[3], b)
		q4, b = bits.Sub64(r4, 0, b)

		// if borrow break
		if b != 0 {
			break
		}

		// r = q
		r4, r3, r2, r1, r0 = q4, q3, q2, q1, q0
	}

	z.arr[3], z.arr[2], z.arr[1], z.arr[0] = r3, r2, r1, r0

	return z
}
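
Aside on Reciprocal, illustrative only and not part of mod.gno: each stage above is one Newton-Raphson step for f(r) = 1/r - y, i.e. r' = r*(2 - y*r), carried out in fixed point with bits.Mul64/bits.Add64, so the number of correct bits roughly doubles per stage (32 -> 64 -> 128 -> 192 -> 320, with the extra width absorbing rounding error). The same iteration is easier to see in ordinary floating point; the stand-alone demo below is hypothetical and uses plain float64 arithmetic rather than anything from this package.

package main

// newtonReciprocal approximates 1/y for a left-aligned operand 0.5 <= y < 1,
// mirroring the refinement used by Reciprocal: r' = r*(2 - y*r).
// If r = (1-e)/y, then r' = (1-e*e)/y, so the relative error is squared
// (and the count of correct bits roughly doubled) at every step.
func newtonReciprocal(y float64, steps int) float64 {
	r := 1.0 // crude seed; Reciprocal instead seeds with a 32-bit division
	for i := 0; i < steps; i++ {
		r = r * (2 - y*r)
	}
	return r
}

func main() {
	y := 0.640625 // example value in [1/2, 1), like the left-aligned divisor above
	r := newtonReciprocal(y, 6)
	println(r, 1/y) // after a handful of steps the two agree to within rounding error
}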
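
How the two functions are meant to be combined (again an illustrative sketch, not code from this file): reduce4 takes the 512-bit value x as eight little-endian 64-bit words together with the constant mu produced by Reciprocal, which is roughly floor(2^512/m) and depends only on the modulus, so a caller reducing many values by the same four-word modulus would compute mu once and reuse it. The helper name below is hypothetical; the package's real entry points (for example a MulMod-style function) may be organized differently.

// reduceSketch is a hypothetical wrapper showing the intended call sequence;
// it is not defined in this package.
func reduceSketch(x [8]uint64, m *Uint) Uint {
	if m.arr[3] == 0 {
		// Reciprocal returns zero for m < 2^192, so reduce4 cannot be used;
		// a caller would have to fall back to a generic mod routine here.
		panic("reduceSketch: modulus must occupy all four words")
	}

	mu := Reciprocal(m)      // Barrett constant, reusable for this modulus
	return reduce4(x, m, mu) // least non-negative residue of x modulo m
}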