github.com/consensys/gnark-crypto@v0.14.0/field/generator/internal/templates/element/mul_cios.go (about) 1 package element 2 3 // MulCIOS text book CIOS works for all modulus. 4 // 5 // There are couple of variations to the multiplication (and squaring) algorithms. 6 // 7 // All versions are derived from the Montgomery CIOS algorithm: see 8 // section 2.3.2 of Tolga Acar's thesis 9 // https://www.microsoft.com/en-us/research/wp-content/uploads/1998/06/97Acar.pdf 10 // 11 // For 1-word modulus, the generator will call mul_cios_one_limb (standard REDC) 12 // 13 // For 13-word+ modulus, the generator will output a unoptimized textbook CIOS code, in plain Go. 14 // 15 // For all other moduli, we look at the available bits in the last limb. 16 // If they are none (like secp256k1) we generate a unoptimized textbook CIOS code, in plain Go, for all architectures. 17 // If there is at least one we can omit a carry propagation in the CIOS algorithm. 18 // If there is at least two we can use the same technique for the CIOS Squaring. 19 // See appendix in https://eprint.iacr.org/2022/1400.pdf for the exact condition. 20 // 21 // In practice, we have 3 different targets in mind: x86(amd64), arm64 and wasm. 22 // 23 // For amd64, we can leverage (when available) the BMI2 and ADX instructions to have 2-carry-chains in parallel. 24 // This make the use of assembly worth it as it results in a significant perf improvement; most CPUs since 2016 support these 25 // instructions, and we assume it to be the "default path"; in case the CPU has no support, we fall back to a slow, unoptimized version. 26 // 27 // On amd64, the Squaring algorithm always call the Multiplication (assembly) implementation. 28 // 29 // For arm64, we unroll the loops in the CIOS (+nocarry optimization) algorithm, such that the instructions generated 30 // by the Go compiler closely match what we would hand-write. Hence, there is no assembly needed for arm64 target. 31 // 32 // Additionally, if 2-bits+ are available on the last limb, we have a template to generate a dedicated Squaring algorithm 33 // This is not activated by default, to minimize the codebase size. 34 // On M1, AWS Graviton3 it results in a 5-10% speedup. On some mobile devices, speed up observed was more important (~20%). 35 // 36 // The same (arm64) unrolled Go code produce satisfying performance for WASM (compiled using TinyGo). 37 const MulCIOS = ` 38 {{ define "mul_cios" }} 39 var t [{{add .all.NbWords 1}}]uint64 40 var D uint64 41 var m, C uint64 42 43 {{- range $j := .all.NbWordsIndexesFull}} 44 // ----------------------------------- 45 // First loop 46 {{ if eq $j 0}} 47 C, t[0] = bits.Mul64({{$.V2}}[{{$j}}], {{$.V1}}[0]) 48 {{- range $i := $.all.NbWordsIndexesNoZero}} 49 C, t[{{$i}}] = madd1({{$.V2}}[{{$j}}], {{$.V1}}[{{$i}}], C) 50 {{- end}} 51 {{ else }} 52 C, t[0] = madd1({{$.V2}}[{{$j}}], {{$.V1}}[0], t[0]) 53 {{- range $i := $.all.NbWordsIndexesNoZero}} 54 C, t[{{$i}}] = madd2({{$.V2}}[{{$j}}], {{$.V1}}[{{$i}}], t[{{$i}}], C) 55 {{- end}} 56 {{ end }} 57 t[{{$.all.NbWords}}], D = bits.Add64(t[{{$.all.NbWords}}], C, 0) 58 59 // m = t[0]n'[0] mod W 60 m = t[0] * qInvNeg 61 62 // ----------------------------------- 63 // Second loop 64 C = madd0(m, q0, t[0]) 65 {{- range $i := $.all.NbWordsIndexesNoZero}} 66 C, t[{{sub $i 1}}] = madd2(m, q{{$i}}, t[{{$i}}], C) 67 {{- end}} 68 69 t[{{sub $.all.NbWords 1}}], C = bits.Add64(t[{{$.all.NbWords}}], C, 0) 70 t[{{$.all.NbWords}}], _ = bits.Add64(0, D, C) 71 {{- end}} 72 73 74 if t[{{$.all.NbWords}}] != 0 { 75 // we need to reduce, we have a result on {{add 1 $.all.NbWords}} words 76 {{- if gt $.all.NbWords 1}} 77 var b uint64 78 {{- end}} 79 z[0], {{- if gt $.all.NbWords 1}}b{{- else}}_{{- end}} = bits.Sub64(t[0], q0, 0) 80 {{- range $i := .all.NbWordsIndexesNoZero}} 81 {{- if eq $i $.all.NbWordsLastIndex}} 82 z[{{$i}}], _ = bits.Sub64(t[{{$i}}], q{{$i}}, b) 83 {{- else }} 84 z[{{$i}}], b = bits.Sub64(t[{{$i}}], q{{$i}}, b) 85 {{- end}} 86 {{- end}} 87 return {{if $.ReturnZ }} z{{- end}} 88 } 89 90 // copy t into z 91 {{- range $i := $.all.NbWordsIndexesFull}} 92 z[{{$i}}] = t[{{$i}}] 93 {{- end}} 94 95 {{ end }} 96 97 {{ define "mul_cios_one_limb" }} 98 // In fact, since the modulus R fits on one register, the CIOS algorithm gets reduced to standard REDC (textbook Montgomery reduction): 99 // hi, lo := x * y 100 // m := (lo * qInvNeg) mod R 101 // (*) r := (hi * R + lo + m * q) / R 102 // reduce r if necessary 103 104 // On the emphasized line, we get r = hi + (lo + m * q) / R 105 // If we write hi2, lo2 = m * q then R | m * q - lo2 ⇒ R | (lo * qInvNeg) q - lo2 = -lo - lo2 106 // This shows lo + lo2 = 0 mod R. i.e. lo + lo2 = 0 if lo = 0 and R otherwise. 107 // Which finally gives (lo + m * q) / R = (lo + lo2 + R hi2) / R = hi2 + (lo+lo2) / R = hi2 + (lo != 0) 108 // This "optimization" lets us do away with one MUL instruction on ARM architectures and is available for all q < R. 109 110 var r uint64 111 hi, lo := bits.Mul64({{$.V1}}[0], {{$.V2}}[0]) 112 if lo != 0 { 113 hi++ // x[0] * y[0] ≤ 2¹²⁸ - 2⁶⁵ + 1, meaning hi ≤ 2⁶⁴ - 2 so no need to worry about overflow 114 } 115 m := lo * qInvNeg 116 hi2, _ := bits.Mul64(m, q) 117 r, carry := bits.Add64(hi2, hi, 0) 118 119 if carry != 0 || r >= q { 120 // we need to reduce 121 r -= q 122 } 123 z[0] = r 124 {{ end }} 125 ` 126 127 const MulDoc = ` 128 {{define "mul_doc noCarry"}} 129 // Implements CIOS multiplication -- section 2.3.2 of Tolga Acar's thesis 130 // https://www.microsoft.com/en-us/research/wp-content/uploads/1998/06/97Acar.pdf 131 // 132 // The algorithm: 133 // 134 // for i=0 to N-1 135 // C := 0 136 // for j=0 to N-1 137 // (C,t[j]) := t[j] + x[j]*y[i] + C 138 // (t[N+1],t[N]) := t[N] + C 139 // 140 // C := 0 141 // m := t[0]*q'[0] mod D 142 // (C,_) := t[0] + m*q[0] 143 // for j=1 to N-1 144 // (C,t[j-1]) := t[j] + m*q[j] + C 145 // 146 // (C,t[N-1]) := t[N] + C 147 // t[N] := t[N+1] + C 148 // 149 // → N is the number of machine words needed to store the modulus q 150 // → D is the word size. For example, on a 64-bit architecture D is 2 64 151 // → x[i], y[i], q[i] is the ith word of the numbers x,y,q 152 // → q'[0] is the lowest word of the number -q⁻¹ mod r. This quantity is pre-computed, as it does not depend on the inputs. 153 // → t is a temporary array of size N+2 154 // → C, S are machine words. A pair (C,S) refers to (hi-bits, lo-bits) of a two-word number 155 {{- if .noCarry}} 156 // 157 // As described here https://hackmd.io/@gnark/modular_multiplication we can get rid of one carry chain and simplify: 158 // (also described in https://eprint.iacr.org/2022/1400.pdf annex) 159 // 160 // for i=0 to N-1 161 // (A,t[0]) := t[0] + x[0]*y[i] 162 // m := t[0]*q'[0] mod W 163 // C,_ := t[0] + m*q[0] 164 // for j=1 to N-1 165 // (A,t[j]) := t[j] + x[j]*y[i] + A 166 // (C,t[j-1]) := t[j] + m*q[j] + C 167 // 168 // t[N-1] = C + A 169 // 170 // This optimization saves 5N + 2 additions in the algorithm, and can be used whenever the highest bit 171 // of the modulus is zero (and not all of the remaining bits are set). 172 {{- end}} 173 {{ end }} 174 `