github.com/consensys/gnark-crypto@v0.14.0/field/generator/internal/templates/element/mul_cios.go (about)

     1  package element
     2  
     3  // MulCIOS text book CIOS works for all modulus.
     4  //
     5  // There are couple of variations to the multiplication (and squaring) algorithms.
     6  //
     7  // All versions are derived from the Montgomery CIOS algorithm: see
     8  // section 2.3.2 of Tolga Acar's thesis
     9  // https://www.microsoft.com/en-us/research/wp-content/uploads/1998/06/97Acar.pdf
    10  //
    11  // For 1-word modulus, the generator will call mul_cios_one_limb (standard REDC)
    12  //
    13  // For 13-word+ modulus, the generator will output a unoptimized textbook CIOS code, in plain Go.
    14  //
    15  // For all other moduli, we look at the available bits in the last limb.
    16  // If they are none (like secp256k1) we generate a unoptimized textbook CIOS code, in plain Go, for all architectures.
    17  // If there is at least one we can omit a carry propagation in the CIOS algorithm.
    18  // If there is at least two we can use the same technique for the CIOS Squaring.
    19  // See appendix in https://eprint.iacr.org/2022/1400.pdf for the exact condition.
    20  //
    21  // In practice, we have 3 different targets in mind: x86(amd64), arm64 and wasm.
    22  //
    23  // For amd64, we can leverage (when available) the BMI2 and ADX instructions to have 2-carry-chains in parallel.
    24  // This make the use of assembly worth it as it results in a significant perf improvement; most CPUs since 2016 support these
    25  // instructions, and we assume it to be the "default path"; in case the CPU has no support, we fall back to a slow, unoptimized version.
    26  //
    27  // On amd64, the Squaring algorithm always call the Multiplication (assembly) implementation.
    28  //
    29  // For arm64, we unroll the loops in the CIOS (+nocarry optimization) algorithm, such that the instructions generated
    30  // by the Go compiler closely match what we would hand-write. Hence, there is no assembly needed for arm64 target.
    31  //
    32  // Additionally, if 2-bits+ are available on the last limb, we have a template to generate a dedicated Squaring algorithm
    33  // This is not activated by default, to minimize the codebase size.
    34  // On M1, AWS Graviton3 it results in a 5-10% speedup. On some mobile devices, speed up observed was more important (~20%).
    35  //
    36  // The same (arm64) unrolled Go code produce satisfying performance for WASM (compiled using TinyGo).
    37  const MulCIOS = `
    38  {{ define "mul_cios" }}
    39  	var t [{{add .all.NbWords 1}}]uint64
    40  	var D uint64
    41  	var m, C uint64
    42  
    43  	{{- range $j := .all.NbWordsIndexesFull}}
    44  		// -----------------------------------
    45  		// First loop
    46  		{{ if eq $j 0}}
    47  			C, t[0] = bits.Mul64({{$.V2}}[{{$j}}], {{$.V1}}[0])
    48  			{{- range $i := $.all.NbWordsIndexesNoZero}}
    49  				C, t[{{$i}}] = madd1({{$.V2}}[{{$j}}], {{$.V1}}[{{$i}}], C)
    50  			{{- end}}
    51  		{{ else }}
    52  			C, t[0] = madd1({{$.V2}}[{{$j}}], {{$.V1}}[0], t[0])
    53  			{{- range $i := $.all.NbWordsIndexesNoZero}}
    54  				C, t[{{$i}}] = madd2({{$.V2}}[{{$j}}], {{$.V1}}[{{$i}}], t[{{$i}}], C)
    55  			{{- end}}
    56  		{{ end }}
    57  		t[{{$.all.NbWords}}], D = bits.Add64(t[{{$.all.NbWords}}], C, 0)
    58  
    59  		// m = t[0]n'[0] mod W
    60  		m = t[0] * qInvNeg
    61  
    62  		// -----------------------------------
    63  		// Second loop
    64  		C = madd0(m, q0, t[0])
    65  		{{- range $i := $.all.NbWordsIndexesNoZero}}
    66  				C, t[{{sub $i 1}}] = madd2(m, q{{$i}}, t[{{$i}}], C)
    67  		{{- end}}
    68  
    69  		 t[{{sub $.all.NbWords 1}}], C = bits.Add64(t[{{$.all.NbWords}}], C, 0)
    70  		 t[{{$.all.NbWords}}], _ = bits.Add64(0, D, C)
    71  	{{- end}}
    72  
    73  
    74  	if t[{{$.all.NbWords}}] != 0 {
    75  		// we need to reduce, we have a result on {{add 1 $.all.NbWords}} words
    76  		{{- if gt $.all.NbWords 1}}
    77  		var b uint64
    78  		{{- end}}
    79  		z[0], {{- if gt $.all.NbWords 1}}b{{- else}}_{{- end}} = bits.Sub64(t[0], q0, 0)
    80  		{{- range $i := .all.NbWordsIndexesNoZero}}
    81  			{{-  if eq $i $.all.NbWordsLastIndex}}
    82  				z[{{$i}}], _ = bits.Sub64(t[{{$i}}], q{{$i}}, b)
    83  			{{-  else  }}
    84  				z[{{$i}}], b = bits.Sub64(t[{{$i}}], q{{$i}}, b)
    85  			{{- end}}
    86  		{{- end}}
    87  		return {{if $.ReturnZ }} z{{- end}}
    88  	}
    89  
    90  	// copy t into z 
    91  	{{- range $i := $.all.NbWordsIndexesFull}}
    92  		z[{{$i}}] = t[{{$i}}]
    93  	{{- end}}
    94  
    95  {{ end }}
    96  
    97  {{ define "mul_cios_one_limb" }}
    98  	// In fact, since the modulus R fits on one register, the CIOS algorithm gets reduced to standard REDC (textbook Montgomery reduction):
    99  	// hi, lo := x * y
   100  	// m := (lo * qInvNeg) mod R
   101  	// (*) r := (hi * R + lo + m * q) / R
   102  	// reduce r if necessary
   103  
   104  	// On the emphasized line, we get r = hi + (lo + m * q) / R
   105  	// If we write hi2, lo2 = m * q then R | m * q - lo2 ⇒ R | (lo * qInvNeg) q - lo2 = -lo - lo2
   106  	// This shows lo + lo2 = 0 mod R. i.e. lo + lo2 = 0 if lo = 0 and R otherwise.
   107  	// Which finally gives (lo + m * q) / R = (lo + lo2 + R hi2) / R = hi2 + (lo+lo2) / R = hi2 + (lo != 0)
   108  	// This "optimization" lets us do away with one MUL instruction on ARM architectures and is available for all q < R.
   109  
   110  	var r uint64
   111  	hi, lo := bits.Mul64({{$.V1}}[0], {{$.V2}}[0])
   112  	if lo != 0 {
   113  		hi++ // x[0] * y[0] ≤ 2¹²⁸ - 2⁶⁵ + 1, meaning hi ≤ 2⁶⁴ - 2 so no need to worry about overflow
   114  	}
   115  	m := lo * qInvNeg
   116  	hi2, _ := bits.Mul64(m, q)
   117  	r, carry := bits.Add64(hi2, hi, 0)
   118  
   119  	if carry != 0 || r >= q  {
   120  		// we need to reduce
   121  		r -= q 
   122  	}
   123  	z[0] = r 
   124  {{ end }}
   125  `
   126  
   127  const MulDoc = `
   128  {{define "mul_doc noCarry"}}
   129  // Implements CIOS multiplication -- section 2.3.2 of Tolga Acar's thesis
   130  // https://www.microsoft.com/en-us/research/wp-content/uploads/1998/06/97Acar.pdf
   131  // 
   132  // The algorithm:
   133  // 
   134  // for i=0 to N-1
   135  // 		C := 0
   136  // 		for j=0 to N-1
   137  // 			(C,t[j]) := t[j] + x[j]*y[i] + C
   138  // 		(t[N+1],t[N]) := t[N] + C
   139  //		
   140  // 		C := 0
   141  // 		m := t[0]*q'[0] mod D
   142  // 		(C,_) := t[0] + m*q[0]
   143  // 		for j=1 to N-1
   144  // 			(C,t[j-1]) := t[j] + m*q[j] + C
   145  //		
   146  // 		(C,t[N-1]) := t[N] + C
   147  // 		t[N] := t[N+1] + C
   148  //
   149  // → N is the number of machine words needed to store the modulus q
   150  // → D is the word size. For example, on a 64-bit architecture D is 2	64
   151  // → x[i], y[i], q[i] is the ith word of the numbers x,y,q
   152  // → q'[0] is the lowest word of the number -q⁻¹ mod r. This quantity is pre-computed, as it does not depend on the inputs.
   153  // → t is a temporary array of size N+2 
   154  // → C, S are machine words. A pair (C,S) refers to (hi-bits, lo-bits) of a two-word number
   155  {{- if .noCarry}}
   156  // 
   157  // As described here https://hackmd.io/@gnark/modular_multiplication we can get rid of one carry chain and simplify:
   158  // (also described in https://eprint.iacr.org/2022/1400.pdf annex)
   159  //
   160  // for i=0 to N-1
   161  // 		(A,t[0]) := t[0] + x[0]*y[i] 
   162  // 		m := t[0]*q'[0] mod W
   163  // 		C,_ := t[0] + m*q[0]
   164  // 		for j=1 to N-1
   165  // 			(A,t[j])  := t[j] + x[j]*y[i] + A
   166  // 			(C,t[j-1]) := t[j] + m*q[j] + C
   167  //		
   168  // 		t[N-1] = C + A
   169  //
   170  // This optimization saves 5N + 2 additions in the algorithm, and can be used whenever the highest bit 
   171  // of the modulus is zero (and not all of the remaining bits are set).
   172  {{- end}}
   173  {{ end }}
   174  `