github.com/consensys/gnark-crypto@v0.14.0/ecc/bw6-761/fp/element.go (about)

     1  // Copyright 2020 ConsenSys Software Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Code generated by consensys/gnark-crypto DO NOT EDIT
    16  
    17  package fp
    18  
    19  import (
    20  	"crypto/rand"
    21  	"encoding/binary"
    22  	"errors"
    23  	"io"
    24  	"math/big"
    25  	"math/bits"
    26  	"reflect"
    27  	"strconv"
    28  	"strings"
    29  
    30  	"github.com/bits-and-blooms/bitset"
    31  	"github.com/consensys/gnark-crypto/field/hash"
    32  	"github.com/consensys/gnark-crypto/field/pool"
    33  )
    34  
    35  // Element represents a field element stored on 12 words (uint64)
    36  //
    37  // Element are assumed to be in Montgomery form in all methods.
    38  //
    39  // Modulus q =
    40  //
    41  //	q[base10] = 6891450384315732539396789682275657542479668912536150109513790160209623422243491736087683183289411687640864567753786613451161759120554247759349511699125301598951605099378508850372543631423596795951899700429969112842764913119068299
    42  //	q[base16] = 0x122e824fb83ce0ad187c94004faff3eb926186a81d14688528275ef8087be41707ba638e584e91903cebaff25b423048689c8ed12f9fd9071dcd3dc73ebff2e98a116c25667a8f8160cf8aeeaf0a437e6913e6870000082f49d00000000008b
    43  //
    44  // # Warning
    45  //
    46  // This code has not been audited and is provided as-is. In particular, there is no security guarantees such as constant time implementation or side-channel attack resistance.
    47  type Element [12]uint64
    48  
    49  const (
    50  	Limbs = 12  // number of 64 bits words needed to represent a Element
    51  	Bits  = 761 // number of bits needed to represent a Element
    52  	Bytes = 96  // number of bytes needed to represent a Element
    53  )
    54  
    55  // Field modulus q
    56  const (
    57  	q0  uint64 = 17626244516597989515
    58  	q1  uint64 = 16614129118623039618
    59  	q2  uint64 = 1588918198704579639
    60  	q3  uint64 = 10998096788944562424
    61  	q4  uint64 = 8204665564953313070
    62  	q5  uint64 = 9694500593442880912
    63  	q6  uint64 = 274362232328168196
    64  	q7  uint64 = 8105254717682411801
    65  	q8  uint64 = 5945444129596489281
    66  	q9  uint64 = 13341377791855249032
    67  	q10 uint64 = 15098257552581525310
    68  	q11 uint64 = 81882988782276106
    69  )
    70  
    71  var qElement = Element{
    72  	q0,
    73  	q1,
    74  	q2,
    75  	q3,
    76  	q4,
    77  	q5,
    78  	q6,
    79  	q7,
    80  	q8,
    81  	q9,
    82  	q10,
    83  	q11,
    84  }
    85  
    86  var _modulus big.Int // q stored as big.Int
    87  
    88  // Modulus returns q as a big.Int
    89  //
    90  //	q[base10] = 6891450384315732539396789682275657542479668912536150109513790160209623422243491736087683183289411687640864567753786613451161759120554247759349511699125301598951605099378508850372543631423596795951899700429969112842764913119068299
    91  //	q[base16] = 0x122e824fb83ce0ad187c94004faff3eb926186a81d14688528275ef8087be41707ba638e584e91903cebaff25b423048689c8ed12f9fd9071dcd3dc73ebff2e98a116c25667a8f8160cf8aeeaf0a437e6913e6870000082f49d00000000008b
    92  func Modulus() *big.Int {
    93  	return new(big.Int).Set(&_modulus)
    94  }
    95  
    96  // q + r'.r = 1, i.e., qInvNeg = - q⁻¹ mod r
    97  // used for Montgomery reduction
    98  const qInvNeg uint64 = 744663313386281181
    99  
   100  func init() {
   101  	_modulus.SetString("122e824fb83ce0ad187c94004faff3eb926186a81d14688528275ef8087be41707ba638e584e91903cebaff25b423048689c8ed12f9fd9071dcd3dc73ebff2e98a116c25667a8f8160cf8aeeaf0a437e6913e6870000082f49d00000000008b", 16)
   102  }
   103  
   104  // NewElement returns a new Element from a uint64 value
   105  //
   106  // it is equivalent to
   107  //
   108  //	var v Element
   109  //	v.SetUint64(...)
   110  func NewElement(v uint64) Element {
   111  	z := Element{v}
   112  	z.Mul(&z, &rSquare)
   113  	return z
   114  }
   115  
   116  // SetUint64 sets z to v and returns z
   117  func (z *Element) SetUint64(v uint64) *Element {
   118  	//  sets z LSB to v (non-Montgomery form) and convert z to Montgomery form
   119  	*z = Element{v}
   120  	return z.Mul(z, &rSquare) // z.toMont()
   121  }
   122  
   123  // SetInt64 sets z to v and returns z
   124  func (z *Element) SetInt64(v int64) *Element {
   125  
   126  	// absolute value of v
   127  	m := v >> 63
   128  	z.SetUint64(uint64((v ^ m) - m))
   129  
   130  	if m != 0 {
   131  		// v is negative
   132  		z.Neg(z)
   133  	}
   134  
   135  	return z
   136  }
   137  
   138  // Set z = x and returns z
   139  func (z *Element) Set(x *Element) *Element {
   140  	z[0] = x[0]
   141  	z[1] = x[1]
   142  	z[2] = x[2]
   143  	z[3] = x[3]
   144  	z[4] = x[4]
   145  	z[5] = x[5]
   146  	z[6] = x[6]
   147  	z[7] = x[7]
   148  	z[8] = x[8]
   149  	z[9] = x[9]
   150  	z[10] = x[10]
   151  	z[11] = x[11]
   152  	return z
   153  }
   154  
   155  // SetInterface converts provided interface into Element
   156  // returns an error if provided type is not supported
   157  // supported types:
   158  //
   159  //	Element
   160  //	*Element
   161  //	uint64
   162  //	int
   163  //	string (see SetString for valid formats)
   164  //	*big.Int
   165  //	big.Int
   166  //	[]byte
   167  func (z *Element) SetInterface(i1 interface{}) (*Element, error) {
   168  	if i1 == nil {
   169  		return nil, errors.New("can't set fp.Element with <nil>")
   170  	}
   171  
   172  	switch c1 := i1.(type) {
   173  	case Element:
   174  		return z.Set(&c1), nil
   175  	case *Element:
   176  		if c1 == nil {
   177  			return nil, errors.New("can't set fp.Element with <nil>")
   178  		}
   179  		return z.Set(c1), nil
   180  	case uint8:
   181  		return z.SetUint64(uint64(c1)), nil
   182  	case uint16:
   183  		return z.SetUint64(uint64(c1)), nil
   184  	case uint32:
   185  		return z.SetUint64(uint64(c1)), nil
   186  	case uint:
   187  		return z.SetUint64(uint64(c1)), nil
   188  	case uint64:
   189  		return z.SetUint64(c1), nil
   190  	case int8:
   191  		return z.SetInt64(int64(c1)), nil
   192  	case int16:
   193  		return z.SetInt64(int64(c1)), nil
   194  	case int32:
   195  		return z.SetInt64(int64(c1)), nil
   196  	case int64:
   197  		return z.SetInt64(c1), nil
   198  	case int:
   199  		return z.SetInt64(int64(c1)), nil
   200  	case string:
   201  		return z.SetString(c1)
   202  	case *big.Int:
   203  		if c1 == nil {
   204  			return nil, errors.New("can't set fp.Element with <nil>")
   205  		}
   206  		return z.SetBigInt(c1), nil
   207  	case big.Int:
   208  		return z.SetBigInt(&c1), nil
   209  	case []byte:
   210  		return z.SetBytes(c1), nil
   211  	default:
   212  		return nil, errors.New("can't set fp.Element from type " + reflect.TypeOf(i1).String())
   213  	}
   214  }
   215  
   216  // SetZero z = 0
   217  func (z *Element) SetZero() *Element {
   218  	z[0] = 0
   219  	z[1] = 0
   220  	z[2] = 0
   221  	z[3] = 0
   222  	z[4] = 0
   223  	z[5] = 0
   224  	z[6] = 0
   225  	z[7] = 0
   226  	z[8] = 0
   227  	z[9] = 0
   228  	z[10] = 0
   229  	z[11] = 0
   230  	return z
   231  }
   232  
   233  // SetOne z = 1 (in Montgomery form)
   234  func (z *Element) SetOne() *Element {
   235  	z[0] = 144959613005956565
   236  	z[1] = 6509995272855063783
   237  	z[2] = 11428286765660613342
   238  	z[3] = 15738672438262922740
   239  	z[4] = 17071399330169272331
   240  	z[5] = 13899911246788437003
   241  	z[6] = 12055474021000362245
   242  	z[7] = 2545351818702954755
   243  	z[8] = 8887388221587179644
   244  	z[9] = 5009280847225881135
   245  	z[10] = 15539704305423854047
   246  	z[11] = 23071597697427581
   247  	return z
   248  }
   249  
   250  // Div z = x*y⁻¹ (mod q)
   251  func (z *Element) Div(x, y *Element) *Element {
   252  	var yInv Element
   253  	yInv.Inverse(y)
   254  	z.Mul(x, &yInv)
   255  	return z
   256  }
   257  
   258  // Equal returns z == x; constant-time
   259  func (z *Element) Equal(x *Element) bool {
   260  	return z.NotEqual(x) == 0
   261  }
   262  
   263  // NotEqual returns 0 if and only if z == x; constant-time
   264  func (z *Element) NotEqual(x *Element) uint64 {
   265  	return (z[11] ^ x[11]) | (z[10] ^ x[10]) | (z[9] ^ x[9]) | (z[8] ^ x[8]) | (z[7] ^ x[7]) | (z[6] ^ x[6]) | (z[5] ^ x[5]) | (z[4] ^ x[4]) | (z[3] ^ x[3]) | (z[2] ^ x[2]) | (z[1] ^ x[1]) | (z[0] ^ x[0])
   266  }
   267  
   268  // IsZero returns z == 0
   269  func (z *Element) IsZero() bool {
   270  	return (z[11] | z[10] | z[9] | z[8] | z[7] | z[6] | z[5] | z[4] | z[3] | z[2] | z[1] | z[0]) == 0
   271  }
   272  
   273  // IsOne returns z == 1
   274  func (z *Element) IsOne() bool {
   275  	return ((z[11] ^ 23071597697427581) | (z[10] ^ 15539704305423854047) | (z[9] ^ 5009280847225881135) | (z[8] ^ 8887388221587179644) | (z[7] ^ 2545351818702954755) | (z[6] ^ 12055474021000362245) | (z[5] ^ 13899911246788437003) | (z[4] ^ 17071399330169272331) | (z[3] ^ 15738672438262922740) | (z[2] ^ 11428286765660613342) | (z[1] ^ 6509995272855063783) | (z[0] ^ 144959613005956565)) == 0
   276  }
   277  
   278  // IsUint64 reports whether z can be represented as an uint64.
   279  func (z *Element) IsUint64() bool {
   280  	zz := *z
   281  	zz.fromMont()
   282  	return zz.FitsOnOneWord()
   283  }
   284  
   285  // Uint64 returns the uint64 representation of x. If x cannot be represented in a uint64, the result is undefined.
   286  func (z *Element) Uint64() uint64 {
   287  	return z.Bits()[0]
   288  }
   289  
   290  // FitsOnOneWord reports whether z words (except the least significant word) are 0
   291  //
   292  // It is the responsibility of the caller to convert from Montgomery to Regular form if needed.
   293  func (z *Element) FitsOnOneWord() bool {
   294  	return (z[11] | z[10] | z[9] | z[8] | z[7] | z[6] | z[5] | z[4] | z[3] | z[2] | z[1]) == 0
   295  }
   296  
   297  // Cmp compares (lexicographic order) z and x and returns:
   298  //
   299  //	-1 if z <  x
   300  //	 0 if z == x
   301  //	+1 if z >  x
   302  func (z *Element) Cmp(x *Element) int {
   303  	_z := z.Bits()
   304  	_x := x.Bits()
   305  	if _z[11] > _x[11] {
   306  		return 1
   307  	} else if _z[11] < _x[11] {
   308  		return -1
   309  	}
   310  	if _z[10] > _x[10] {
   311  		return 1
   312  	} else if _z[10] < _x[10] {
   313  		return -1
   314  	}
   315  	if _z[9] > _x[9] {
   316  		return 1
   317  	} else if _z[9] < _x[9] {
   318  		return -1
   319  	}
   320  	if _z[8] > _x[8] {
   321  		return 1
   322  	} else if _z[8] < _x[8] {
   323  		return -1
   324  	}
   325  	if _z[7] > _x[7] {
   326  		return 1
   327  	} else if _z[7] < _x[7] {
   328  		return -1
   329  	}
   330  	if _z[6] > _x[6] {
   331  		return 1
   332  	} else if _z[6] < _x[6] {
   333  		return -1
   334  	}
   335  	if _z[5] > _x[5] {
   336  		return 1
   337  	} else if _z[5] < _x[5] {
   338  		return -1
   339  	}
   340  	if _z[4] > _x[4] {
   341  		return 1
   342  	} else if _z[4] < _x[4] {
   343  		return -1
   344  	}
   345  	if _z[3] > _x[3] {
   346  		return 1
   347  	} else if _z[3] < _x[3] {
   348  		return -1
   349  	}
   350  	if _z[2] > _x[2] {
   351  		return 1
   352  	} else if _z[2] < _x[2] {
   353  		return -1
   354  	}
   355  	if _z[1] > _x[1] {
   356  		return 1
   357  	} else if _z[1] < _x[1] {
   358  		return -1
   359  	}
   360  	if _z[0] > _x[0] {
   361  		return 1
   362  	} else if _z[0] < _x[0] {
   363  		return -1
   364  	}
   365  	return 0
   366  }
   367  
   368  // LexicographicallyLargest returns true if this element is strictly lexicographically
   369  // larger than its negation, false otherwise
   370  func (z *Element) LexicographicallyLargest() bool {
   371  	// adapted from github.com/zkcrypto/bls12_381
   372  	// we check if the element is larger than (q-1) / 2
   373  	// if z - (((q -1) / 2) + 1) have no underflow, then z > (q-1) / 2
   374  
   375  	_z := z.Bits()
   376  
   377  	var b uint64
   378  	_, b = bits.Sub64(_z[0], 8813122258298994758, 0)
   379  	_, b = bits.Sub64(_z[1], 17530436596166295617, b)
   380  	_, b = bits.Sub64(_z[2], 794459099352289819, b)
   381  	_, b = bits.Sub64(_z[3], 5499048394472281212, b)
   382  	_, b = bits.Sub64(_z[4], 4102332782476656535, b)
   383  	_, b = bits.Sub64(_z[5], 4847250296721440456, b)
   384  	_, b = bits.Sub64(_z[6], 9360553153018859906, b)
   385  	_, b = bits.Sub64(_z[7], 13275999395695981708, b)
   386  	_, b = bits.Sub64(_z[8], 2972722064798244640, b)
   387  	_, b = bits.Sub64(_z[9], 6670688895927624516, b)
   388  	_, b = bits.Sub64(_z[10], 7549128776290762655, b)
   389  	_, b = bits.Sub64(_z[11], 40941494391138053, b)
   390  
   391  	return b == 0
   392  }
   393  
   394  // SetRandom sets z to a uniform random value in [0, q).
   395  //
   396  // This might error only if reading from crypto/rand.Reader errors,
   397  // in which case, value of z is undefined.
   398  func (z *Element) SetRandom() (*Element, error) {
   399  	// this code is generated for all modulus
   400  	// and derived from go/src/crypto/rand/util.go
   401  
   402  	// l is number of limbs * 8; the number of bytes needed to reconstruct 12 uint64
   403  	const l = 96
   404  
   405  	// bitLen is the maximum bit length needed to encode a value < q.
   406  	const bitLen = 761
   407  
   408  	// k is the maximum byte length needed to encode a value < q.
   409  	const k = (bitLen + 7) / 8
   410  
   411  	// b is the number of bits in the most significant byte of q-1.
   412  	b := uint(bitLen % 8)
   413  	if b == 0 {
   414  		b = 8
   415  	}
   416  
   417  	var bytes [l]byte
   418  
   419  	for {
   420  		// note that bytes[k:l] is always 0
   421  		if _, err := io.ReadFull(rand.Reader, bytes[:k]); err != nil {
   422  			return nil, err
   423  		}
   424  
   425  		// Clear unused bits in in the most significant byte to increase probability
   426  		// that the candidate is < q.
   427  		bytes[k-1] &= uint8(int(1<<b) - 1)
   428  		z[0] = binary.LittleEndian.Uint64(bytes[0:8])
   429  		z[1] = binary.LittleEndian.Uint64(bytes[8:16])
   430  		z[2] = binary.LittleEndian.Uint64(bytes[16:24])
   431  		z[3] = binary.LittleEndian.Uint64(bytes[24:32])
   432  		z[4] = binary.LittleEndian.Uint64(bytes[32:40])
   433  		z[5] = binary.LittleEndian.Uint64(bytes[40:48])
   434  		z[6] = binary.LittleEndian.Uint64(bytes[48:56])
   435  		z[7] = binary.LittleEndian.Uint64(bytes[56:64])
   436  		z[8] = binary.LittleEndian.Uint64(bytes[64:72])
   437  		z[9] = binary.LittleEndian.Uint64(bytes[72:80])
   438  		z[10] = binary.LittleEndian.Uint64(bytes[80:88])
   439  		z[11] = binary.LittleEndian.Uint64(bytes[88:96])
   440  
   441  		if !z.smallerThanModulus() {
   442  			continue // ignore the candidate and re-sample
   443  		}
   444  
   445  		return z, nil
   446  	}
   447  }
   448  
   449  // smallerThanModulus returns true if z < q
   450  // This is not constant time
   451  func (z *Element) smallerThanModulus() bool {
   452  	return (z[11] < q11 || (z[11] == q11 && (z[10] < q10 || (z[10] == q10 && (z[9] < q9 || (z[9] == q9 && (z[8] < q8 || (z[8] == q8 && (z[7] < q7 || (z[7] == q7 && (z[6] < q6 || (z[6] == q6 && (z[5] < q5 || (z[5] == q5 && (z[4] < q4 || (z[4] == q4 && (z[3] < q3 || (z[3] == q3 && (z[2] < q2 || (z[2] == q2 && (z[1] < q1 || (z[1] == q1 && (z[0] < q0)))))))))))))))))))))))
   453  }
   454  
   455  // One returns 1
   456  func One() Element {
   457  	var one Element
   458  	one.SetOne()
   459  	return one
   460  }
   461  
   462  // Halve sets z to z / 2 (mod q)
   463  func (z *Element) Halve() {
   464  	var carry uint64
   465  
   466  	if z[0]&1 == 1 {
   467  		// z = z + q
   468  		z[0], carry = bits.Add64(z[0], q0, 0)
   469  		z[1], carry = bits.Add64(z[1], q1, carry)
   470  		z[2], carry = bits.Add64(z[2], q2, carry)
   471  		z[3], carry = bits.Add64(z[3], q3, carry)
   472  		z[4], carry = bits.Add64(z[4], q4, carry)
   473  		z[5], carry = bits.Add64(z[5], q5, carry)
   474  		z[6], carry = bits.Add64(z[6], q6, carry)
   475  		z[7], carry = bits.Add64(z[7], q7, carry)
   476  		z[8], carry = bits.Add64(z[8], q8, carry)
   477  		z[9], carry = bits.Add64(z[9], q9, carry)
   478  		z[10], carry = bits.Add64(z[10], q10, carry)
   479  		z[11], _ = bits.Add64(z[11], q11, carry)
   480  
   481  	}
   482  	// z = z >> 1
   483  	z[0] = z[0]>>1 | z[1]<<63
   484  	z[1] = z[1]>>1 | z[2]<<63
   485  	z[2] = z[2]>>1 | z[3]<<63
   486  	z[3] = z[3]>>1 | z[4]<<63
   487  	z[4] = z[4]>>1 | z[5]<<63
   488  	z[5] = z[5]>>1 | z[6]<<63
   489  	z[6] = z[6]>>1 | z[7]<<63
   490  	z[7] = z[7]>>1 | z[8]<<63
   491  	z[8] = z[8]>>1 | z[9]<<63
   492  	z[9] = z[9]>>1 | z[10]<<63
   493  	z[10] = z[10]>>1 | z[11]<<63
   494  	z[11] >>= 1
   495  
   496  }
   497  
   498  // fromMont converts z in place (i.e. mutates) from Montgomery to regular representation
   499  // sets and returns z = z * 1
   500  func (z *Element) fromMont() *Element {
   501  	fromMont(z)
   502  	return z
   503  }
   504  
   505  // Add z = x + y (mod q)
   506  func (z *Element) Add(x, y *Element) *Element {
   507  
   508  	var carry uint64
   509  	z[0], carry = bits.Add64(x[0], y[0], 0)
   510  	z[1], carry = bits.Add64(x[1], y[1], carry)
   511  	z[2], carry = bits.Add64(x[2], y[2], carry)
   512  	z[3], carry = bits.Add64(x[3], y[3], carry)
   513  	z[4], carry = bits.Add64(x[4], y[4], carry)
   514  	z[5], carry = bits.Add64(x[5], y[5], carry)
   515  	z[6], carry = bits.Add64(x[6], y[6], carry)
   516  	z[7], carry = bits.Add64(x[7], y[7], carry)
   517  	z[8], carry = bits.Add64(x[8], y[8], carry)
   518  	z[9], carry = bits.Add64(x[9], y[9], carry)
   519  	z[10], carry = bits.Add64(x[10], y[10], carry)
   520  	z[11], _ = bits.Add64(x[11], y[11], carry)
   521  
   522  	// if z ⩾ q → z -= q
   523  	if !z.smallerThanModulus() {
   524  		var b uint64
   525  		z[0], b = bits.Sub64(z[0], q0, 0)
   526  		z[1], b = bits.Sub64(z[1], q1, b)
   527  		z[2], b = bits.Sub64(z[2], q2, b)
   528  		z[3], b = bits.Sub64(z[3], q3, b)
   529  		z[4], b = bits.Sub64(z[4], q4, b)
   530  		z[5], b = bits.Sub64(z[5], q5, b)
   531  		z[6], b = bits.Sub64(z[6], q6, b)
   532  		z[7], b = bits.Sub64(z[7], q7, b)
   533  		z[8], b = bits.Sub64(z[8], q8, b)
   534  		z[9], b = bits.Sub64(z[9], q9, b)
   535  		z[10], b = bits.Sub64(z[10], q10, b)
   536  		z[11], _ = bits.Sub64(z[11], q11, b)
   537  	}
   538  	return z
   539  }
   540  
   541  // Double z = x + x (mod q), aka Lsh 1
   542  func (z *Element) Double(x *Element) *Element {
   543  
   544  	var carry uint64
   545  	z[0], carry = bits.Add64(x[0], x[0], 0)
   546  	z[1], carry = bits.Add64(x[1], x[1], carry)
   547  	z[2], carry = bits.Add64(x[2], x[2], carry)
   548  	z[3], carry = bits.Add64(x[3], x[3], carry)
   549  	z[4], carry = bits.Add64(x[4], x[4], carry)
   550  	z[5], carry = bits.Add64(x[5], x[5], carry)
   551  	z[6], carry = bits.Add64(x[6], x[6], carry)
   552  	z[7], carry = bits.Add64(x[7], x[7], carry)
   553  	z[8], carry = bits.Add64(x[8], x[8], carry)
   554  	z[9], carry = bits.Add64(x[9], x[9], carry)
   555  	z[10], carry = bits.Add64(x[10], x[10], carry)
   556  	z[11], _ = bits.Add64(x[11], x[11], carry)
   557  
   558  	// if z ⩾ q → z -= q
   559  	if !z.smallerThanModulus() {
   560  		var b uint64
   561  		z[0], b = bits.Sub64(z[0], q0, 0)
   562  		z[1], b = bits.Sub64(z[1], q1, b)
   563  		z[2], b = bits.Sub64(z[2], q2, b)
   564  		z[3], b = bits.Sub64(z[3], q3, b)
   565  		z[4], b = bits.Sub64(z[4], q4, b)
   566  		z[5], b = bits.Sub64(z[5], q5, b)
   567  		z[6], b = bits.Sub64(z[6], q6, b)
   568  		z[7], b = bits.Sub64(z[7], q7, b)
   569  		z[8], b = bits.Sub64(z[8], q8, b)
   570  		z[9], b = bits.Sub64(z[9], q9, b)
   571  		z[10], b = bits.Sub64(z[10], q10, b)
   572  		z[11], _ = bits.Sub64(z[11], q11, b)
   573  	}
   574  	return z
   575  }
   576  
   577  // Sub z = x - y (mod q)
   578  func (z *Element) Sub(x, y *Element) *Element {
   579  	var b uint64
   580  	z[0], b = bits.Sub64(x[0], y[0], 0)
   581  	z[1], b = bits.Sub64(x[1], y[1], b)
   582  	z[2], b = bits.Sub64(x[2], y[2], b)
   583  	z[3], b = bits.Sub64(x[3], y[3], b)
   584  	z[4], b = bits.Sub64(x[4], y[4], b)
   585  	z[5], b = bits.Sub64(x[5], y[5], b)
   586  	z[6], b = bits.Sub64(x[6], y[6], b)
   587  	z[7], b = bits.Sub64(x[7], y[7], b)
   588  	z[8], b = bits.Sub64(x[8], y[8], b)
   589  	z[9], b = bits.Sub64(x[9], y[9], b)
   590  	z[10], b = bits.Sub64(x[10], y[10], b)
   591  	z[11], b = bits.Sub64(x[11], y[11], b)
   592  	if b != 0 {
   593  		var c uint64
   594  		z[0], c = bits.Add64(z[0], q0, 0)
   595  		z[1], c = bits.Add64(z[1], q1, c)
   596  		z[2], c = bits.Add64(z[2], q2, c)
   597  		z[3], c = bits.Add64(z[3], q3, c)
   598  		z[4], c = bits.Add64(z[4], q4, c)
   599  		z[5], c = bits.Add64(z[5], q5, c)
   600  		z[6], c = bits.Add64(z[6], q6, c)
   601  		z[7], c = bits.Add64(z[7], q7, c)
   602  		z[8], c = bits.Add64(z[8], q8, c)
   603  		z[9], c = bits.Add64(z[9], q9, c)
   604  		z[10], c = bits.Add64(z[10], q10, c)
   605  		z[11], _ = bits.Add64(z[11], q11, c)
   606  	}
   607  	return z
   608  }
   609  
   610  // Neg z = q - x
   611  func (z *Element) Neg(x *Element) *Element {
   612  	if x.IsZero() {
   613  		z.SetZero()
   614  		return z
   615  	}
   616  	var borrow uint64
   617  	z[0], borrow = bits.Sub64(q0, x[0], 0)
   618  	z[1], borrow = bits.Sub64(q1, x[1], borrow)
   619  	z[2], borrow = bits.Sub64(q2, x[2], borrow)
   620  	z[3], borrow = bits.Sub64(q3, x[3], borrow)
   621  	z[4], borrow = bits.Sub64(q4, x[4], borrow)
   622  	z[5], borrow = bits.Sub64(q5, x[5], borrow)
   623  	z[6], borrow = bits.Sub64(q6, x[6], borrow)
   624  	z[7], borrow = bits.Sub64(q7, x[7], borrow)
   625  	z[8], borrow = bits.Sub64(q8, x[8], borrow)
   626  	z[9], borrow = bits.Sub64(q9, x[9], borrow)
   627  	z[10], borrow = bits.Sub64(q10, x[10], borrow)
   628  	z[11], _ = bits.Sub64(q11, x[11], borrow)
   629  	return z
   630  }
   631  
   632  // Select is a constant-time conditional move.
   633  // If c=0, z = x0. Else z = x1
   634  func (z *Element) Select(c int, x0 *Element, x1 *Element) *Element {
   635  	cC := uint64((int64(c) | -int64(c)) >> 63) // "canonicized" into: 0 if c=0, -1 otherwise
   636  	z[0] = x0[0] ^ cC&(x0[0]^x1[0])
   637  	z[1] = x0[1] ^ cC&(x0[1]^x1[1])
   638  	z[2] = x0[2] ^ cC&(x0[2]^x1[2])
   639  	z[3] = x0[3] ^ cC&(x0[3]^x1[3])
   640  	z[4] = x0[4] ^ cC&(x0[4]^x1[4])
   641  	z[5] = x0[5] ^ cC&(x0[5]^x1[5])
   642  	z[6] = x0[6] ^ cC&(x0[6]^x1[6])
   643  	z[7] = x0[7] ^ cC&(x0[7]^x1[7])
   644  	z[8] = x0[8] ^ cC&(x0[8]^x1[8])
   645  	z[9] = x0[9] ^ cC&(x0[9]^x1[9])
   646  	z[10] = x0[10] ^ cC&(x0[10]^x1[10])
   647  	z[11] = x0[11] ^ cC&(x0[11]^x1[11])
   648  	return z
   649  }
   650  
   651  // _mulGeneric is unoptimized textbook CIOS
   652  // it is a fallback solution on x86 when ADX instruction set is not available
   653  // and is used for testing purposes.
   654  func _mulGeneric(z, x, y *Element) {
   655  
   656  	// Implements CIOS multiplication -- section 2.3.2 of Tolga Acar's thesis
   657  	// https://www.microsoft.com/en-us/research/wp-content/uploads/1998/06/97Acar.pdf
   658  	//
   659  	// The algorithm:
   660  	//
   661  	// for i=0 to N-1
   662  	// 		C := 0
   663  	// 		for j=0 to N-1
   664  	// 			(C,t[j]) := t[j] + x[j]*y[i] + C
   665  	// 		(t[N+1],t[N]) := t[N] + C
   666  	//
   667  	// 		C := 0
   668  	// 		m := t[0]*q'[0] mod D
   669  	// 		(C,_) := t[0] + m*q[0]
   670  	// 		for j=1 to N-1
   671  	// 			(C,t[j-1]) := t[j] + m*q[j] + C
   672  	//
   673  	// 		(C,t[N-1]) := t[N] + C
   674  	// 		t[N] := t[N+1] + C
   675  	//
   676  	// → N is the number of machine words needed to store the modulus q
   677  	// → D is the word size. For example, on a 64-bit architecture D is 2	64
   678  	// → x[i], y[i], q[i] is the ith word of the numbers x,y,q
   679  	// → q'[0] is the lowest word of the number -q⁻¹ mod r. This quantity is pre-computed, as it does not depend on the inputs.
   680  	// → t is a temporary array of size N+2
   681  	// → C, S are machine words. A pair (C,S) refers to (hi-bits, lo-bits) of a two-word number
   682  
   683  	var t [13]uint64
   684  	var D uint64
   685  	var m, C uint64
   686  	// -----------------------------------
   687  	// First loop
   688  
   689  	C, t[0] = bits.Mul64(y[0], x[0])
   690  	C, t[1] = madd1(y[0], x[1], C)
   691  	C, t[2] = madd1(y[0], x[2], C)
   692  	C, t[3] = madd1(y[0], x[3], C)
   693  	C, t[4] = madd1(y[0], x[4], C)
   694  	C, t[5] = madd1(y[0], x[5], C)
   695  	C, t[6] = madd1(y[0], x[6], C)
   696  	C, t[7] = madd1(y[0], x[7], C)
   697  	C, t[8] = madd1(y[0], x[8], C)
   698  	C, t[9] = madd1(y[0], x[9], C)
   699  	C, t[10] = madd1(y[0], x[10], C)
   700  	C, t[11] = madd1(y[0], x[11], C)
   701  
   702  	t[12], D = bits.Add64(t[12], C, 0)
   703  
   704  	// m = t[0]n'[0] mod W
   705  	m = t[0] * qInvNeg
   706  
   707  	// -----------------------------------
   708  	// Second loop
   709  	C = madd0(m, q0, t[0])
   710  	C, t[0] = madd2(m, q1, t[1], C)
   711  	C, t[1] = madd2(m, q2, t[2], C)
   712  	C, t[2] = madd2(m, q3, t[3], C)
   713  	C, t[3] = madd2(m, q4, t[4], C)
   714  	C, t[4] = madd2(m, q5, t[5], C)
   715  	C, t[5] = madd2(m, q6, t[6], C)
   716  	C, t[6] = madd2(m, q7, t[7], C)
   717  	C, t[7] = madd2(m, q8, t[8], C)
   718  	C, t[8] = madd2(m, q9, t[9], C)
   719  	C, t[9] = madd2(m, q10, t[10], C)
   720  	C, t[10] = madd2(m, q11, t[11], C)
   721  
   722  	t[11], C = bits.Add64(t[12], C, 0)
   723  	t[12], _ = bits.Add64(0, D, C)
   724  	// -----------------------------------
   725  	// First loop
   726  
   727  	C, t[0] = madd1(y[1], x[0], t[0])
   728  	C, t[1] = madd2(y[1], x[1], t[1], C)
   729  	C, t[2] = madd2(y[1], x[2], t[2], C)
   730  	C, t[3] = madd2(y[1], x[3], t[3], C)
   731  	C, t[4] = madd2(y[1], x[4], t[4], C)
   732  	C, t[5] = madd2(y[1], x[5], t[5], C)
   733  	C, t[6] = madd2(y[1], x[6], t[6], C)
   734  	C, t[7] = madd2(y[1], x[7], t[7], C)
   735  	C, t[8] = madd2(y[1], x[8], t[8], C)
   736  	C, t[9] = madd2(y[1], x[9], t[9], C)
   737  	C, t[10] = madd2(y[1], x[10], t[10], C)
   738  	C, t[11] = madd2(y[1], x[11], t[11], C)
   739  
   740  	t[12], D = bits.Add64(t[12], C, 0)
   741  
   742  	// m = t[0]n'[0] mod W
   743  	m = t[0] * qInvNeg
   744  
   745  	// -----------------------------------
   746  	// Second loop
   747  	C = madd0(m, q0, t[0])
   748  	C, t[0] = madd2(m, q1, t[1], C)
   749  	C, t[1] = madd2(m, q2, t[2], C)
   750  	C, t[2] = madd2(m, q3, t[3], C)
   751  	C, t[3] = madd2(m, q4, t[4], C)
   752  	C, t[4] = madd2(m, q5, t[5], C)
   753  	C, t[5] = madd2(m, q6, t[6], C)
   754  	C, t[6] = madd2(m, q7, t[7], C)
   755  	C, t[7] = madd2(m, q8, t[8], C)
   756  	C, t[8] = madd2(m, q9, t[9], C)
   757  	C, t[9] = madd2(m, q10, t[10], C)
   758  	C, t[10] = madd2(m, q11, t[11], C)
   759  
   760  	t[11], C = bits.Add64(t[12], C, 0)
   761  	t[12], _ = bits.Add64(0, D, C)
   762  	// -----------------------------------
   763  	// First loop
   764  
   765  	C, t[0] = madd1(y[2], x[0], t[0])
   766  	C, t[1] = madd2(y[2], x[1], t[1], C)
   767  	C, t[2] = madd2(y[2], x[2], t[2], C)
   768  	C, t[3] = madd2(y[2], x[3], t[3], C)
   769  	C, t[4] = madd2(y[2], x[4], t[4], C)
   770  	C, t[5] = madd2(y[2], x[5], t[5], C)
   771  	C, t[6] = madd2(y[2], x[6], t[6], C)
   772  	C, t[7] = madd2(y[2], x[7], t[7], C)
   773  	C, t[8] = madd2(y[2], x[8], t[8], C)
   774  	C, t[9] = madd2(y[2], x[9], t[9], C)
   775  	C, t[10] = madd2(y[2], x[10], t[10], C)
   776  	C, t[11] = madd2(y[2], x[11], t[11], C)
   777  
   778  	t[12], D = bits.Add64(t[12], C, 0)
   779  
   780  	// m = t[0]n'[0] mod W
   781  	m = t[0] * qInvNeg
   782  
   783  	// -----------------------------------
   784  	// Second loop
   785  	C = madd0(m, q0, t[0])
   786  	C, t[0] = madd2(m, q1, t[1], C)
   787  	C, t[1] = madd2(m, q2, t[2], C)
   788  	C, t[2] = madd2(m, q3, t[3], C)
   789  	C, t[3] = madd2(m, q4, t[4], C)
   790  	C, t[4] = madd2(m, q5, t[5], C)
   791  	C, t[5] = madd2(m, q6, t[6], C)
   792  	C, t[6] = madd2(m, q7, t[7], C)
   793  	C, t[7] = madd2(m, q8, t[8], C)
   794  	C, t[8] = madd2(m, q9, t[9], C)
   795  	C, t[9] = madd2(m, q10, t[10], C)
   796  	C, t[10] = madd2(m, q11, t[11], C)
   797  
   798  	t[11], C = bits.Add64(t[12], C, 0)
   799  	t[12], _ = bits.Add64(0, D, C)
   800  	// -----------------------------------
   801  	// First loop
   802  
   803  	C, t[0] = madd1(y[3], x[0], t[0])
   804  	C, t[1] = madd2(y[3], x[1], t[1], C)
   805  	C, t[2] = madd2(y[3], x[2], t[2], C)
   806  	C, t[3] = madd2(y[3], x[3], t[3], C)
   807  	C, t[4] = madd2(y[3], x[4], t[4], C)
   808  	C, t[5] = madd2(y[3], x[5], t[5], C)
   809  	C, t[6] = madd2(y[3], x[6], t[6], C)
   810  	C, t[7] = madd2(y[3], x[7], t[7], C)
   811  	C, t[8] = madd2(y[3], x[8], t[8], C)
   812  	C, t[9] = madd2(y[3], x[9], t[9], C)
   813  	C, t[10] = madd2(y[3], x[10], t[10], C)
   814  	C, t[11] = madd2(y[3], x[11], t[11], C)
   815  
   816  	t[12], D = bits.Add64(t[12], C, 0)
   817  
   818  	// m = t[0]n'[0] mod W
   819  	m = t[0] * qInvNeg
   820  
   821  	// -----------------------------------
   822  	// Second loop
   823  	C = madd0(m, q0, t[0])
   824  	C, t[0] = madd2(m, q1, t[1], C)
   825  	C, t[1] = madd2(m, q2, t[2], C)
   826  	C, t[2] = madd2(m, q3, t[3], C)
   827  	C, t[3] = madd2(m, q4, t[4], C)
   828  	C, t[4] = madd2(m, q5, t[5], C)
   829  	C, t[5] = madd2(m, q6, t[6], C)
   830  	C, t[6] = madd2(m, q7, t[7], C)
   831  	C, t[7] = madd2(m, q8, t[8], C)
   832  	C, t[8] = madd2(m, q9, t[9], C)
   833  	C, t[9] = madd2(m, q10, t[10], C)
   834  	C, t[10] = madd2(m, q11, t[11], C)
   835  
   836  	t[11], C = bits.Add64(t[12], C, 0)
   837  	t[12], _ = bits.Add64(0, D, C)
   838  	// -----------------------------------
   839  	// First loop
   840  
   841  	C, t[0] = madd1(y[4], x[0], t[0])
   842  	C, t[1] = madd2(y[4], x[1], t[1], C)
   843  	C, t[2] = madd2(y[4], x[2], t[2], C)
   844  	C, t[3] = madd2(y[4], x[3], t[3], C)
   845  	C, t[4] = madd2(y[4], x[4], t[4], C)
   846  	C, t[5] = madd2(y[4], x[5], t[5], C)
   847  	C, t[6] = madd2(y[4], x[6], t[6], C)
   848  	C, t[7] = madd2(y[4], x[7], t[7], C)
   849  	C, t[8] = madd2(y[4], x[8], t[8], C)
   850  	C, t[9] = madd2(y[4], x[9], t[9], C)
   851  	C, t[10] = madd2(y[4], x[10], t[10], C)
   852  	C, t[11] = madd2(y[4], x[11], t[11], C)
   853  
   854  	t[12], D = bits.Add64(t[12], C, 0)
   855  
   856  	// m = t[0]n'[0] mod W
   857  	m = t[0] * qInvNeg
   858  
   859  	// -----------------------------------
   860  	// Second loop
   861  	C = madd0(m, q0, t[0])
   862  	C, t[0] = madd2(m, q1, t[1], C)
   863  	C, t[1] = madd2(m, q2, t[2], C)
   864  	C, t[2] = madd2(m, q3, t[3], C)
   865  	C, t[3] = madd2(m, q4, t[4], C)
   866  	C, t[4] = madd2(m, q5, t[5], C)
   867  	C, t[5] = madd2(m, q6, t[6], C)
   868  	C, t[6] = madd2(m, q7, t[7], C)
   869  	C, t[7] = madd2(m, q8, t[8], C)
   870  	C, t[8] = madd2(m, q9, t[9], C)
   871  	C, t[9] = madd2(m, q10, t[10], C)
   872  	C, t[10] = madd2(m, q11, t[11], C)
   873  
   874  	t[11], C = bits.Add64(t[12], C, 0)
   875  	t[12], _ = bits.Add64(0, D, C)
   876  	// -----------------------------------
   877  	// First loop
   878  
   879  	C, t[0] = madd1(y[5], x[0], t[0])
   880  	C, t[1] = madd2(y[5], x[1], t[1], C)
   881  	C, t[2] = madd2(y[5], x[2], t[2], C)
   882  	C, t[3] = madd2(y[5], x[3], t[3], C)
   883  	C, t[4] = madd2(y[5], x[4], t[4], C)
   884  	C, t[5] = madd2(y[5], x[5], t[5], C)
   885  	C, t[6] = madd2(y[5], x[6], t[6], C)
   886  	C, t[7] = madd2(y[5], x[7], t[7], C)
   887  	C, t[8] = madd2(y[5], x[8], t[8], C)
   888  	C, t[9] = madd2(y[5], x[9], t[9], C)
   889  	C, t[10] = madd2(y[5], x[10], t[10], C)
   890  	C, t[11] = madd2(y[5], x[11], t[11], C)
   891  
   892  	t[12], D = bits.Add64(t[12], C, 0)
   893  
   894  	// m = t[0]n'[0] mod W
   895  	m = t[0] * qInvNeg
   896  
   897  	// -----------------------------------
   898  	// Second loop
   899  	C = madd0(m, q0, t[0])
   900  	C, t[0] = madd2(m, q1, t[1], C)
   901  	C, t[1] = madd2(m, q2, t[2], C)
   902  	C, t[2] = madd2(m, q3, t[3], C)
   903  	C, t[3] = madd2(m, q4, t[4], C)
   904  	C, t[4] = madd2(m, q5, t[5], C)
   905  	C, t[5] = madd2(m, q6, t[6], C)
   906  	C, t[6] = madd2(m, q7, t[7], C)
   907  	C, t[7] = madd2(m, q8, t[8], C)
   908  	C, t[8] = madd2(m, q9, t[9], C)
   909  	C, t[9] = madd2(m, q10, t[10], C)
   910  	C, t[10] = madd2(m, q11, t[11], C)
   911  
   912  	t[11], C = bits.Add64(t[12], C, 0)
   913  	t[12], _ = bits.Add64(0, D, C)
   914  	// -----------------------------------
   915  	// First loop
   916  
   917  	C, t[0] = madd1(y[6], x[0], t[0])
   918  	C, t[1] = madd2(y[6], x[1], t[1], C)
   919  	C, t[2] = madd2(y[6], x[2], t[2], C)
   920  	C, t[3] = madd2(y[6], x[3], t[3], C)
   921  	C, t[4] = madd2(y[6], x[4], t[4], C)
   922  	C, t[5] = madd2(y[6], x[5], t[5], C)
   923  	C, t[6] = madd2(y[6], x[6], t[6], C)
   924  	C, t[7] = madd2(y[6], x[7], t[7], C)
   925  	C, t[8] = madd2(y[6], x[8], t[8], C)
   926  	C, t[9] = madd2(y[6], x[9], t[9], C)
   927  	C, t[10] = madd2(y[6], x[10], t[10], C)
   928  	C, t[11] = madd2(y[6], x[11], t[11], C)
   929  
   930  	t[12], D = bits.Add64(t[12], C, 0)
   931  
   932  	// m = t[0]n'[0] mod W
   933  	m = t[0] * qInvNeg
   934  
   935  	// -----------------------------------
   936  	// Second loop
   937  	C = madd0(m, q0, t[0])
   938  	C, t[0] = madd2(m, q1, t[1], C)
   939  	C, t[1] = madd2(m, q2, t[2], C)
   940  	C, t[2] = madd2(m, q3, t[3], C)
   941  	C, t[3] = madd2(m, q4, t[4], C)
   942  	C, t[4] = madd2(m, q5, t[5], C)
   943  	C, t[5] = madd2(m, q6, t[6], C)
   944  	C, t[6] = madd2(m, q7, t[7], C)
   945  	C, t[7] = madd2(m, q8, t[8], C)
   946  	C, t[8] = madd2(m, q9, t[9], C)
   947  	C, t[9] = madd2(m, q10, t[10], C)
   948  	C, t[10] = madd2(m, q11, t[11], C)
   949  
   950  	t[11], C = bits.Add64(t[12], C, 0)
   951  	t[12], _ = bits.Add64(0, D, C)
   952  	// -----------------------------------
   953  	// First loop
   954  
   955  	C, t[0] = madd1(y[7], x[0], t[0])
   956  	C, t[1] = madd2(y[7], x[1], t[1], C)
   957  	C, t[2] = madd2(y[7], x[2], t[2], C)
   958  	C, t[3] = madd2(y[7], x[3], t[3], C)
   959  	C, t[4] = madd2(y[7], x[4], t[4], C)
   960  	C, t[5] = madd2(y[7], x[5], t[5], C)
   961  	C, t[6] = madd2(y[7], x[6], t[6], C)
   962  	C, t[7] = madd2(y[7], x[7], t[7], C)
   963  	C, t[8] = madd2(y[7], x[8], t[8], C)
   964  	C, t[9] = madd2(y[7], x[9], t[9], C)
   965  	C, t[10] = madd2(y[7], x[10], t[10], C)
   966  	C, t[11] = madd2(y[7], x[11], t[11], C)
   967  
   968  	t[12], D = bits.Add64(t[12], C, 0)
   969  
   970  	// m = t[0]n'[0] mod W
   971  	m = t[0] * qInvNeg
   972  
   973  	// -----------------------------------
   974  	// Second loop
   975  	C = madd0(m, q0, t[0])
   976  	C, t[0] = madd2(m, q1, t[1], C)
   977  	C, t[1] = madd2(m, q2, t[2], C)
   978  	C, t[2] = madd2(m, q3, t[3], C)
   979  	C, t[3] = madd2(m, q4, t[4], C)
   980  	C, t[4] = madd2(m, q5, t[5], C)
   981  	C, t[5] = madd2(m, q6, t[6], C)
   982  	C, t[6] = madd2(m, q7, t[7], C)
   983  	C, t[7] = madd2(m, q8, t[8], C)
   984  	C, t[8] = madd2(m, q9, t[9], C)
   985  	C, t[9] = madd2(m, q10, t[10], C)
   986  	C, t[10] = madd2(m, q11, t[11], C)
   987  
   988  	t[11], C = bits.Add64(t[12], C, 0)
   989  	t[12], _ = bits.Add64(0, D, C)
   990  	// -----------------------------------
   991  	// First loop
   992  
   993  	C, t[0] = madd1(y[8], x[0], t[0])
   994  	C, t[1] = madd2(y[8], x[1], t[1], C)
   995  	C, t[2] = madd2(y[8], x[2], t[2], C)
   996  	C, t[3] = madd2(y[8], x[3], t[3], C)
   997  	C, t[4] = madd2(y[8], x[4], t[4], C)
   998  	C, t[5] = madd2(y[8], x[5], t[5], C)
   999  	C, t[6] = madd2(y[8], x[6], t[6], C)
  1000  	C, t[7] = madd2(y[8], x[7], t[7], C)
  1001  	C, t[8] = madd2(y[8], x[8], t[8], C)
  1002  	C, t[9] = madd2(y[8], x[9], t[9], C)
  1003  	C, t[10] = madd2(y[8], x[10], t[10], C)
  1004  	C, t[11] = madd2(y[8], x[11], t[11], C)
  1005  
  1006  	t[12], D = bits.Add64(t[12], C, 0)
  1007  
  1008  	// m = t[0]n'[0] mod W
  1009  	m = t[0] * qInvNeg
  1010  
  1011  	// -----------------------------------
  1012  	// Second loop
  1013  	C = madd0(m, q0, t[0])
  1014  	C, t[0] = madd2(m, q1, t[1], C)
  1015  	C, t[1] = madd2(m, q2, t[2], C)
  1016  	C, t[2] = madd2(m, q3, t[3], C)
  1017  	C, t[3] = madd2(m, q4, t[4], C)
  1018  	C, t[4] = madd2(m, q5, t[5], C)
  1019  	C, t[5] = madd2(m, q6, t[6], C)
  1020  	C, t[6] = madd2(m, q7, t[7], C)
  1021  	C, t[7] = madd2(m, q8, t[8], C)
  1022  	C, t[8] = madd2(m, q9, t[9], C)
  1023  	C, t[9] = madd2(m, q10, t[10], C)
  1024  	C, t[10] = madd2(m, q11, t[11], C)
  1025  
  1026  	t[11], C = bits.Add64(t[12], C, 0)
  1027  	t[12], _ = bits.Add64(0, D, C)
  1028  	// -----------------------------------
  1029  	// First loop
  1030  
  1031  	C, t[0] = madd1(y[9], x[0], t[0])
  1032  	C, t[1] = madd2(y[9], x[1], t[1], C)
  1033  	C, t[2] = madd2(y[9], x[2], t[2], C)
  1034  	C, t[3] = madd2(y[9], x[3], t[3], C)
  1035  	C, t[4] = madd2(y[9], x[4], t[4], C)
  1036  	C, t[5] = madd2(y[9], x[5], t[5], C)
  1037  	C, t[6] = madd2(y[9], x[6], t[6], C)
  1038  	C, t[7] = madd2(y[9], x[7], t[7], C)
  1039  	C, t[8] = madd2(y[9], x[8], t[8], C)
  1040  	C, t[9] = madd2(y[9], x[9], t[9], C)
  1041  	C, t[10] = madd2(y[9], x[10], t[10], C)
  1042  	C, t[11] = madd2(y[9], x[11], t[11], C)
  1043  
  1044  	t[12], D = bits.Add64(t[12], C, 0)
  1045  
  1046  	// m = t[0]n'[0] mod W
  1047  	m = t[0] * qInvNeg
  1048  
  1049  	// -----------------------------------
  1050  	// Second loop
  1051  	C = madd0(m, q0, t[0])
  1052  	C, t[0] = madd2(m, q1, t[1], C)
  1053  	C, t[1] = madd2(m, q2, t[2], C)
  1054  	C, t[2] = madd2(m, q3, t[3], C)
  1055  	C, t[3] = madd2(m, q4, t[4], C)
  1056  	C, t[4] = madd2(m, q5, t[5], C)
  1057  	C, t[5] = madd2(m, q6, t[6], C)
  1058  	C, t[6] = madd2(m, q7, t[7], C)
  1059  	C, t[7] = madd2(m, q8, t[8], C)
  1060  	C, t[8] = madd2(m, q9, t[9], C)
  1061  	C, t[9] = madd2(m, q10, t[10], C)
  1062  	C, t[10] = madd2(m, q11, t[11], C)
  1063  
  1064  	t[11], C = bits.Add64(t[12], C, 0)
  1065  	t[12], _ = bits.Add64(0, D, C)
  1066  	// -----------------------------------
  1067  	// First loop
  1068  
  1069  	C, t[0] = madd1(y[10], x[0], t[0])
  1070  	C, t[1] = madd2(y[10], x[1], t[1], C)
  1071  	C, t[2] = madd2(y[10], x[2], t[2], C)
  1072  	C, t[3] = madd2(y[10], x[3], t[3], C)
  1073  	C, t[4] = madd2(y[10], x[4], t[4], C)
  1074  	C, t[5] = madd2(y[10], x[5], t[5], C)
  1075  	C, t[6] = madd2(y[10], x[6], t[6], C)
  1076  	C, t[7] = madd2(y[10], x[7], t[7], C)
  1077  	C, t[8] = madd2(y[10], x[8], t[8], C)
  1078  	C, t[9] = madd2(y[10], x[9], t[9], C)
  1079  	C, t[10] = madd2(y[10], x[10], t[10], C)
  1080  	C, t[11] = madd2(y[10], x[11], t[11], C)
  1081  
  1082  	t[12], D = bits.Add64(t[12], C, 0)
  1083  
  1084  	// m = t[0]n'[0] mod W
  1085  	m = t[0] * qInvNeg
  1086  
  1087  	// -----------------------------------
  1088  	// Second loop
  1089  	C = madd0(m, q0, t[0])
  1090  	C, t[0] = madd2(m, q1, t[1], C)
  1091  	C, t[1] = madd2(m, q2, t[2], C)
  1092  	C, t[2] = madd2(m, q3, t[3], C)
  1093  	C, t[3] = madd2(m, q4, t[4], C)
  1094  	C, t[4] = madd2(m, q5, t[5], C)
  1095  	C, t[5] = madd2(m, q6, t[6], C)
  1096  	C, t[6] = madd2(m, q7, t[7], C)
  1097  	C, t[7] = madd2(m, q8, t[8], C)
  1098  	C, t[8] = madd2(m, q9, t[9], C)
  1099  	C, t[9] = madd2(m, q10, t[10], C)
  1100  	C, t[10] = madd2(m, q11, t[11], C)
  1101  
  1102  	t[11], C = bits.Add64(t[12], C, 0)
  1103  	t[12], _ = bits.Add64(0, D, C)
  1104  	// -----------------------------------
  1105  	// First loop
  1106  
  1107  	C, t[0] = madd1(y[11], x[0], t[0])
  1108  	C, t[1] = madd2(y[11], x[1], t[1], C)
  1109  	C, t[2] = madd2(y[11], x[2], t[2], C)
  1110  	C, t[3] = madd2(y[11], x[3], t[3], C)
  1111  	C, t[4] = madd2(y[11], x[4], t[4], C)
  1112  	C, t[5] = madd2(y[11], x[5], t[5], C)
  1113  	C, t[6] = madd2(y[11], x[6], t[6], C)
  1114  	C, t[7] = madd2(y[11], x[7], t[7], C)
  1115  	C, t[8] = madd2(y[11], x[8], t[8], C)
  1116  	C, t[9] = madd2(y[11], x[9], t[9], C)
  1117  	C, t[10] = madd2(y[11], x[10], t[10], C)
  1118  	C, t[11] = madd2(y[11], x[11], t[11], C)
  1119  
  1120  	t[12], D = bits.Add64(t[12], C, 0)
  1121  
  1122  	// m = t[0]n'[0] mod W
  1123  	m = t[0] * qInvNeg
  1124  
  1125  	// -----------------------------------
  1126  	// Second loop
  1127  	C = madd0(m, q0, t[0])
  1128  	C, t[0] = madd2(m, q1, t[1], C)
  1129  	C, t[1] = madd2(m, q2, t[2], C)
  1130  	C, t[2] = madd2(m, q3, t[3], C)
  1131  	C, t[3] = madd2(m, q4, t[4], C)
  1132  	C, t[4] = madd2(m, q5, t[5], C)
  1133  	C, t[5] = madd2(m, q6, t[6], C)
  1134  	C, t[6] = madd2(m, q7, t[7], C)
  1135  	C, t[7] = madd2(m, q8, t[8], C)
  1136  	C, t[8] = madd2(m, q9, t[9], C)
  1137  	C, t[9] = madd2(m, q10, t[10], C)
  1138  	C, t[10] = madd2(m, q11, t[11], C)
  1139  
  1140  	t[11], C = bits.Add64(t[12], C, 0)
  1141  	t[12], _ = bits.Add64(0, D, C)
  1142  
  1143  	if t[12] != 0 {
  1144  		// we need to reduce, we have a result on 13 words
  1145  		var b uint64
  1146  		z[0], b = bits.Sub64(t[0], q0, 0)
  1147  		z[1], b = bits.Sub64(t[1], q1, b)
  1148  		z[2], b = bits.Sub64(t[2], q2, b)
  1149  		z[3], b = bits.Sub64(t[3], q3, b)
  1150  		z[4], b = bits.Sub64(t[4], q4, b)
  1151  		z[5], b = bits.Sub64(t[5], q5, b)
  1152  		z[6], b = bits.Sub64(t[6], q6, b)
  1153  		z[7], b = bits.Sub64(t[7], q7, b)
  1154  		z[8], b = bits.Sub64(t[8], q8, b)
  1155  		z[9], b = bits.Sub64(t[9], q9, b)
  1156  		z[10], b = bits.Sub64(t[10], q10, b)
  1157  		z[11], _ = bits.Sub64(t[11], q11, b)
  1158  		return
  1159  	}
  1160  
  1161  	// copy t into z
  1162  	z[0] = t[0]
  1163  	z[1] = t[1]
  1164  	z[2] = t[2]
  1165  	z[3] = t[3]
  1166  	z[4] = t[4]
  1167  	z[5] = t[5]
  1168  	z[6] = t[6]
  1169  	z[7] = t[7]
  1170  	z[8] = t[8]
  1171  	z[9] = t[9]
  1172  	z[10] = t[10]
  1173  	z[11] = t[11]
  1174  
  1175  	// if z ⩾ q → z -= q
  1176  	if !z.smallerThanModulus() {
  1177  		var b uint64
  1178  		z[0], b = bits.Sub64(z[0], q0, 0)
  1179  		z[1], b = bits.Sub64(z[1], q1, b)
  1180  		z[2], b = bits.Sub64(z[2], q2, b)
  1181  		z[3], b = bits.Sub64(z[3], q3, b)
  1182  		z[4], b = bits.Sub64(z[4], q4, b)
  1183  		z[5], b = bits.Sub64(z[5], q5, b)
  1184  		z[6], b = bits.Sub64(z[6], q6, b)
  1185  		z[7], b = bits.Sub64(z[7], q7, b)
  1186  		z[8], b = bits.Sub64(z[8], q8, b)
  1187  		z[9], b = bits.Sub64(z[9], q9, b)
  1188  		z[10], b = bits.Sub64(z[10], q10, b)
  1189  		z[11], _ = bits.Sub64(z[11], q11, b)
  1190  	}
  1191  }
  1192  
  1193  func _fromMontGeneric(z *Element) {
  1194  	// the following lines implement z = z * 1
  1195  	// with a modified CIOS montgomery multiplication
  1196  	// see Mul for algorithm documentation
  1197  	{
  1198  		// m = z[0]n'[0] mod W
  1199  		m := z[0] * qInvNeg
  1200  		C := madd0(m, q0, z[0])
  1201  		C, z[0] = madd2(m, q1, z[1], C)
  1202  		C, z[1] = madd2(m, q2, z[2], C)
  1203  		C, z[2] = madd2(m, q3, z[3], C)
  1204  		C, z[3] = madd2(m, q4, z[4], C)
  1205  		C, z[4] = madd2(m, q5, z[5], C)
  1206  		C, z[5] = madd2(m, q6, z[6], C)
  1207  		C, z[6] = madd2(m, q7, z[7], C)
  1208  		C, z[7] = madd2(m, q8, z[8], C)
  1209  		C, z[8] = madd2(m, q9, z[9], C)
  1210  		C, z[9] = madd2(m, q10, z[10], C)
  1211  		C, z[10] = madd2(m, q11, z[11], C)
  1212  		z[11] = C
  1213  	}
  1214  	{
  1215  		// m = z[0]n'[0] mod W
  1216  		m := z[0] * qInvNeg
  1217  		C := madd0(m, q0, z[0])
  1218  		C, z[0] = madd2(m, q1, z[1], C)
  1219  		C, z[1] = madd2(m, q2, z[2], C)
  1220  		C, z[2] = madd2(m, q3, z[3], C)
  1221  		C, z[3] = madd2(m, q4, z[4], C)
  1222  		C, z[4] = madd2(m, q5, z[5], C)
  1223  		C, z[5] = madd2(m, q6, z[6], C)
  1224  		C, z[6] = madd2(m, q7, z[7], C)
  1225  		C, z[7] = madd2(m, q8, z[8], C)
  1226  		C, z[8] = madd2(m, q9, z[9], C)
  1227  		C, z[9] = madd2(m, q10, z[10], C)
  1228  		C, z[10] = madd2(m, q11, z[11], C)
  1229  		z[11] = C
  1230  	}
  1231  	{
  1232  		// m = z[0]n'[0] mod W
  1233  		m := z[0] * qInvNeg
  1234  		C := madd0(m, q0, z[0])
  1235  		C, z[0] = madd2(m, q1, z[1], C)
  1236  		C, z[1] = madd2(m, q2, z[2], C)
  1237  		C, z[2] = madd2(m, q3, z[3], C)
  1238  		C, z[3] = madd2(m, q4, z[4], C)
  1239  		C, z[4] = madd2(m, q5, z[5], C)
  1240  		C, z[5] = madd2(m, q6, z[6], C)
  1241  		C, z[6] = madd2(m, q7, z[7], C)
  1242  		C, z[7] = madd2(m, q8, z[8], C)
  1243  		C, z[8] = madd2(m, q9, z[9], C)
  1244  		C, z[9] = madd2(m, q10, z[10], C)
  1245  		C, z[10] = madd2(m, q11, z[11], C)
  1246  		z[11] = C
  1247  	}
  1248  	{
  1249  		// m = z[0]n'[0] mod W
  1250  		m := z[0] * qInvNeg
  1251  		C := madd0(m, q0, z[0])
  1252  		C, z[0] = madd2(m, q1, z[1], C)
  1253  		C, z[1] = madd2(m, q2, z[2], C)
  1254  		C, z[2] = madd2(m, q3, z[3], C)
  1255  		C, z[3] = madd2(m, q4, z[4], C)
  1256  		C, z[4] = madd2(m, q5, z[5], C)
  1257  		C, z[5] = madd2(m, q6, z[6], C)
  1258  		C, z[6] = madd2(m, q7, z[7], C)
  1259  		C, z[7] = madd2(m, q8, z[8], C)
  1260  		C, z[8] = madd2(m, q9, z[9], C)
  1261  		C, z[9] = madd2(m, q10, z[10], C)
  1262  		C, z[10] = madd2(m, q11, z[11], C)
  1263  		z[11] = C
  1264  	}
  1265  	{
  1266  		// m = z[0]n'[0] mod W
  1267  		m := z[0] * qInvNeg
  1268  		C := madd0(m, q0, z[0])
  1269  		C, z[0] = madd2(m, q1, z[1], C)
  1270  		C, z[1] = madd2(m, q2, z[2], C)
  1271  		C, z[2] = madd2(m, q3, z[3], C)
  1272  		C, z[3] = madd2(m, q4, z[4], C)
  1273  		C, z[4] = madd2(m, q5, z[5], C)
  1274  		C, z[5] = madd2(m, q6, z[6], C)
  1275  		C, z[6] = madd2(m, q7, z[7], C)
  1276  		C, z[7] = madd2(m, q8, z[8], C)
  1277  		C, z[8] = madd2(m, q9, z[9], C)
  1278  		C, z[9] = madd2(m, q10, z[10], C)
  1279  		C, z[10] = madd2(m, q11, z[11], C)
  1280  		z[11] = C
  1281  	}
  1282  	{
  1283  		// m = z[0]n'[0] mod W
  1284  		m := z[0] * qInvNeg
  1285  		C := madd0(m, q0, z[0])
  1286  		C, z[0] = madd2(m, q1, z[1], C)
  1287  		C, z[1] = madd2(m, q2, z[2], C)
  1288  		C, z[2] = madd2(m, q3, z[3], C)
  1289  		C, z[3] = madd2(m, q4, z[4], C)
  1290  		C, z[4] = madd2(m, q5, z[5], C)
  1291  		C, z[5] = madd2(m, q6, z[6], C)
  1292  		C, z[6] = madd2(m, q7, z[7], C)
  1293  		C, z[7] = madd2(m, q8, z[8], C)
  1294  		C, z[8] = madd2(m, q9, z[9], C)
  1295  		C, z[9] = madd2(m, q10, z[10], C)
  1296  		C, z[10] = madd2(m, q11, z[11], C)
  1297  		z[11] = C
  1298  	}
  1299  	{
  1300  		// m = z[0]n'[0] mod W
  1301  		m := z[0] * qInvNeg
  1302  		C := madd0(m, q0, z[0])
  1303  		C, z[0] = madd2(m, q1, z[1], C)
  1304  		C, z[1] = madd2(m, q2, z[2], C)
  1305  		C, z[2] = madd2(m, q3, z[3], C)
  1306  		C, z[3] = madd2(m, q4, z[4], C)
  1307  		C, z[4] = madd2(m, q5, z[5], C)
  1308  		C, z[5] = madd2(m, q6, z[6], C)
  1309  		C, z[6] = madd2(m, q7, z[7], C)
  1310  		C, z[7] = madd2(m, q8, z[8], C)
  1311  		C, z[8] = madd2(m, q9, z[9], C)
  1312  		C, z[9] = madd2(m, q10, z[10], C)
  1313  		C, z[10] = madd2(m, q11, z[11], C)
  1314  		z[11] = C
  1315  	}
  1316  	{
  1317  		// m = z[0]n'[0] mod W
  1318  		m := z[0] * qInvNeg
  1319  		C := madd0(m, q0, z[0])
  1320  		C, z[0] = madd2(m, q1, z[1], C)
  1321  		C, z[1] = madd2(m, q2, z[2], C)
  1322  		C, z[2] = madd2(m, q3, z[3], C)
  1323  		C, z[3] = madd2(m, q4, z[4], C)
  1324  		C, z[4] = madd2(m, q5, z[5], C)
  1325  		C, z[5] = madd2(m, q6, z[6], C)
  1326  		C, z[6] = madd2(m, q7, z[7], C)
  1327  		C, z[7] = madd2(m, q8, z[8], C)
  1328  		C, z[8] = madd2(m, q9, z[9], C)
  1329  		C, z[9] = madd2(m, q10, z[10], C)
  1330  		C, z[10] = madd2(m, q11, z[11], C)
  1331  		z[11] = C
  1332  	}
  1333  	{
  1334  		// m = z[0]n'[0] mod W
  1335  		m := z[0] * qInvNeg
  1336  		C := madd0(m, q0, z[0])
  1337  		C, z[0] = madd2(m, q1, z[1], C)
  1338  		C, z[1] = madd2(m, q2, z[2], C)
  1339  		C, z[2] = madd2(m, q3, z[3], C)
  1340  		C, z[3] = madd2(m, q4, z[4], C)
  1341  		C, z[4] = madd2(m, q5, z[5], C)
  1342  		C, z[5] = madd2(m, q6, z[6], C)
  1343  		C, z[6] = madd2(m, q7, z[7], C)
  1344  		C, z[7] = madd2(m, q8, z[8], C)
  1345  		C, z[8] = madd2(m, q9, z[9], C)
  1346  		C, z[9] = madd2(m, q10, z[10], C)
  1347  		C, z[10] = madd2(m, q11, z[11], C)
  1348  		z[11] = C
  1349  	}
  1350  	{
  1351  		// m = z[0]n'[0] mod W
  1352  		m := z[0] * qInvNeg
  1353  		C := madd0(m, q0, z[0])
  1354  		C, z[0] = madd2(m, q1, z[1], C)
  1355  		C, z[1] = madd2(m, q2, z[2], C)
  1356  		C, z[2] = madd2(m, q3, z[3], C)
  1357  		C, z[3] = madd2(m, q4, z[4], C)
  1358  		C, z[4] = madd2(m, q5, z[5], C)
  1359  		C, z[5] = madd2(m, q6, z[6], C)
  1360  		C, z[6] = madd2(m, q7, z[7], C)
  1361  		C, z[7] = madd2(m, q8, z[8], C)
  1362  		C, z[8] = madd2(m, q9, z[9], C)
  1363  		C, z[9] = madd2(m, q10, z[10], C)
  1364  		C, z[10] = madd2(m, q11, z[11], C)
  1365  		z[11] = C
  1366  	}
  1367  	{
  1368  		// m = z[0]n'[0] mod W
  1369  		m := z[0] * qInvNeg
  1370  		C := madd0(m, q0, z[0])
  1371  		C, z[0] = madd2(m, q1, z[1], C)
  1372  		C, z[1] = madd2(m, q2, z[2], C)
  1373  		C, z[2] = madd2(m, q3, z[3], C)
  1374  		C, z[3] = madd2(m, q4, z[4], C)
  1375  		C, z[4] = madd2(m, q5, z[5], C)
  1376  		C, z[5] = madd2(m, q6, z[6], C)
  1377  		C, z[6] = madd2(m, q7, z[7], C)
  1378  		C, z[7] = madd2(m, q8, z[8], C)
  1379  		C, z[8] = madd2(m, q9, z[9], C)
  1380  		C, z[9] = madd2(m, q10, z[10], C)
  1381  		C, z[10] = madd2(m, q11, z[11], C)
  1382  		z[11] = C
  1383  	}
  1384  	{
  1385  		// m = z[0]n'[0] mod W
  1386  		m := z[0] * qInvNeg
  1387  		C := madd0(m, q0, z[0])
  1388  		C, z[0] = madd2(m, q1, z[1], C)
  1389  		C, z[1] = madd2(m, q2, z[2], C)
  1390  		C, z[2] = madd2(m, q3, z[3], C)
  1391  		C, z[3] = madd2(m, q4, z[4], C)
  1392  		C, z[4] = madd2(m, q5, z[5], C)
  1393  		C, z[5] = madd2(m, q6, z[6], C)
  1394  		C, z[6] = madd2(m, q7, z[7], C)
  1395  		C, z[7] = madd2(m, q8, z[8], C)
  1396  		C, z[8] = madd2(m, q9, z[9], C)
  1397  		C, z[9] = madd2(m, q10, z[10], C)
  1398  		C, z[10] = madd2(m, q11, z[11], C)
  1399  		z[11] = C
  1400  	}
  1401  
  1402  	// if z ⩾ q → z -= q
  1403  	if !z.smallerThanModulus() {
  1404  		var b uint64
  1405  		z[0], b = bits.Sub64(z[0], q0, 0)
  1406  		z[1], b = bits.Sub64(z[1], q1, b)
  1407  		z[2], b = bits.Sub64(z[2], q2, b)
  1408  		z[3], b = bits.Sub64(z[3], q3, b)
  1409  		z[4], b = bits.Sub64(z[4], q4, b)
  1410  		z[5], b = bits.Sub64(z[5], q5, b)
  1411  		z[6], b = bits.Sub64(z[6], q6, b)
  1412  		z[7], b = bits.Sub64(z[7], q7, b)
  1413  		z[8], b = bits.Sub64(z[8], q8, b)
  1414  		z[9], b = bits.Sub64(z[9], q9, b)
  1415  		z[10], b = bits.Sub64(z[10], q10, b)
  1416  		z[11], _ = bits.Sub64(z[11], q11, b)
  1417  	}
  1418  }
  1419  
  1420  func _reduceGeneric(z *Element) {
  1421  
  1422  	// if z ⩾ q → z -= q
  1423  	if !z.smallerThanModulus() {
  1424  		var b uint64
  1425  		z[0], b = bits.Sub64(z[0], q0, 0)
  1426  		z[1], b = bits.Sub64(z[1], q1, b)
  1427  		z[2], b = bits.Sub64(z[2], q2, b)
  1428  		z[3], b = bits.Sub64(z[3], q3, b)
  1429  		z[4], b = bits.Sub64(z[4], q4, b)
  1430  		z[5], b = bits.Sub64(z[5], q5, b)
  1431  		z[6], b = bits.Sub64(z[6], q6, b)
  1432  		z[7], b = bits.Sub64(z[7], q7, b)
  1433  		z[8], b = bits.Sub64(z[8], q8, b)
  1434  		z[9], b = bits.Sub64(z[9], q9, b)
  1435  		z[10], b = bits.Sub64(z[10], q10, b)
  1436  		z[11], _ = bits.Sub64(z[11], q11, b)
  1437  	}
  1438  }
  1439  
  1440  // BatchInvert returns a new slice with every element inverted.
  1441  // Uses Montgomery batch inversion trick
  1442  func BatchInvert(a []Element) []Element {
  1443  	res := make([]Element, len(a))
  1444  	if len(a) == 0 {
  1445  		return res
  1446  	}
  1447  
  1448  	zeroes := bitset.New(uint(len(a)))
  1449  	accumulator := One()
  1450  
  1451  	for i := 0; i < len(a); i++ {
  1452  		if a[i].IsZero() {
  1453  			zeroes.Set(uint(i))
  1454  			continue
  1455  		}
  1456  		res[i] = accumulator
  1457  		accumulator.Mul(&accumulator, &a[i])
  1458  	}
  1459  
  1460  	accumulator.Inverse(&accumulator)
  1461  
  1462  	for i := len(a) - 1; i >= 0; i-- {
  1463  		if zeroes.Test(uint(i)) {
  1464  			continue
  1465  		}
  1466  		res[i].Mul(&res[i], &accumulator)
  1467  		accumulator.Mul(&accumulator, &a[i])
  1468  	}
  1469  
  1470  	return res
  1471  }
  1472  
  1473  func _butterflyGeneric(a, b *Element) {
  1474  	t := *a
  1475  	a.Add(a, b)
  1476  	b.Sub(&t, b)
  1477  }
  1478  
  1479  // BitLen returns the minimum number of bits needed to represent z
  1480  // returns 0 if z == 0
  1481  func (z *Element) BitLen() int {
  1482  	if z[11] != 0 {
  1483  		return 704 + bits.Len64(z[11])
  1484  	}
  1485  	if z[10] != 0 {
  1486  		return 640 + bits.Len64(z[10])
  1487  	}
  1488  	if z[9] != 0 {
  1489  		return 576 + bits.Len64(z[9])
  1490  	}
  1491  	if z[8] != 0 {
  1492  		return 512 + bits.Len64(z[8])
  1493  	}
  1494  	if z[7] != 0 {
  1495  		return 448 + bits.Len64(z[7])
  1496  	}
  1497  	if z[6] != 0 {
  1498  		return 384 + bits.Len64(z[6])
  1499  	}
  1500  	if z[5] != 0 {
  1501  		return 320 + bits.Len64(z[5])
  1502  	}
  1503  	if z[4] != 0 {
  1504  		return 256 + bits.Len64(z[4])
  1505  	}
  1506  	if z[3] != 0 {
  1507  		return 192 + bits.Len64(z[3])
  1508  	}
  1509  	if z[2] != 0 {
  1510  		return 128 + bits.Len64(z[2])
  1511  	}
  1512  	if z[1] != 0 {
  1513  		return 64 + bits.Len64(z[1])
  1514  	}
  1515  	return bits.Len64(z[0])
  1516  }
  1517  
  1518  // Hash msg to count prime field elements.
  1519  // https://tools.ietf.org/html/draft-irtf-cfrg-hash-to-curve-06#section-5.2
  1520  func Hash(msg, dst []byte, count int) ([]Element, error) {
  1521  	// 128 bits of security
  1522  	// L = ceil((ceil(log2(p)) + k) / 8), where k is the security parameter = 128
  1523  	const Bytes = 1 + (Bits-1)/8
  1524  	const L = 16 + Bytes
  1525  
  1526  	lenInBytes := count * L
  1527  	pseudoRandomBytes, err := hash.ExpandMsgXmd(msg, dst, lenInBytes)
  1528  	if err != nil {
  1529  		return nil, err
  1530  	}
  1531  
  1532  	// get temporary big int from the pool
  1533  	vv := pool.BigInt.Get()
  1534  
  1535  	res := make([]Element, count)
  1536  	for i := 0; i < count; i++ {
  1537  		vv.SetBytes(pseudoRandomBytes[i*L : (i+1)*L])
  1538  		res[i].SetBigInt(vv)
  1539  	}
  1540  
  1541  	// release object into pool
  1542  	pool.BigInt.Put(vv)
  1543  
  1544  	return res, nil
  1545  }
  1546  
  1547  // Exp z = xᵏ (mod q)
  1548  func (z *Element) Exp(x Element, k *big.Int) *Element {
  1549  	if k.IsUint64() && k.Uint64() == 0 {
  1550  		return z.SetOne()
  1551  	}
  1552  
  1553  	e := k
  1554  	if k.Sign() == -1 {
  1555  		// negative k, we invert
  1556  		// if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q)
  1557  		x.Inverse(&x)
  1558  
  1559  		// we negate k in a temp big.Int since
  1560  		// Int.Bit(_) of k and -k is different
  1561  		e = pool.BigInt.Get()
  1562  		defer pool.BigInt.Put(e)
  1563  		e.Neg(k)
  1564  	}
  1565  
  1566  	z.Set(&x)
  1567  
  1568  	for i := e.BitLen() - 2; i >= 0; i-- {
  1569  		z.Square(z)
  1570  		if e.Bit(i) == 1 {
  1571  			z.Mul(z, &x)
  1572  		}
  1573  	}
  1574  
  1575  	return z
  1576  }
  1577  
  1578  // rSquare where r is the Montgommery constant
  1579  // see section 2.3.2 of Tolga Acar's thesis
  1580  // https://www.microsoft.com/en-us/research/wp-content/uploads/1998/06/97Acar.pdf
  1581  var rSquare = Element{
  1582  	14305184132582319705,
  1583  	8868935336694416555,
  1584  	9196887162930508889,
  1585  	15486798265448570248,
  1586  	5402985275949444416,
  1587  	10893197322525159598,
  1588  	3204916688966998390,
  1589  	12417238192559061753,
  1590  	12426306557607898622,
  1591  	1305582522441154384,
  1592  	10311846026977660324,
  1593  	48736111365249031,
  1594  }
  1595  
  1596  // toMont converts z to Montgomery form
  1597  // sets and returns z = z * r²
  1598  func (z *Element) toMont() *Element {
  1599  	return z.Mul(z, &rSquare)
  1600  }
  1601  
  1602  // String returns the decimal representation of z as generated by
  1603  // z.Text(10).
  1604  func (z *Element) String() string {
  1605  	return z.Text(10)
  1606  }
  1607  
  1608  // toBigInt returns z as a big.Int in Montgomery form
  1609  func (z *Element) toBigInt(res *big.Int) *big.Int {
  1610  	var b [Bytes]byte
  1611  	binary.BigEndian.PutUint64(b[88:96], z[0])
  1612  	binary.BigEndian.PutUint64(b[80:88], z[1])
  1613  	binary.BigEndian.PutUint64(b[72:80], z[2])
  1614  	binary.BigEndian.PutUint64(b[64:72], z[3])
  1615  	binary.BigEndian.PutUint64(b[56:64], z[4])
  1616  	binary.BigEndian.PutUint64(b[48:56], z[5])
  1617  	binary.BigEndian.PutUint64(b[40:48], z[6])
  1618  	binary.BigEndian.PutUint64(b[32:40], z[7])
  1619  	binary.BigEndian.PutUint64(b[24:32], z[8])
  1620  	binary.BigEndian.PutUint64(b[16:24], z[9])
  1621  	binary.BigEndian.PutUint64(b[8:16], z[10])
  1622  	binary.BigEndian.PutUint64(b[0:8], z[11])
  1623  
  1624  	return res.SetBytes(b[:])
  1625  }
  1626  
  1627  // Text returns the string representation of z in the given base.
  1628  // Base must be between 2 and 36, inclusive. The result uses the
  1629  // lower-case letters 'a' to 'z' for digit values 10 to 35.
  1630  // No prefix (such as "0x") is added to the string. If z is a nil
  1631  // pointer it returns "<nil>".
  1632  // If base == 10 and -z fits in a uint16 prefix "-" is added to the string.
  1633  func (z *Element) Text(base int) string {
  1634  	if base < 2 || base > 36 {
  1635  		panic("invalid base")
  1636  	}
  1637  	if z == nil {
  1638  		return "<nil>"
  1639  	}
  1640  
  1641  	const maxUint16 = 65535
  1642  	if base == 10 {
  1643  		var zzNeg Element
  1644  		zzNeg.Neg(z)
  1645  		zzNeg.fromMont()
  1646  		if zzNeg.FitsOnOneWord() && zzNeg[0] <= maxUint16 && zzNeg[0] != 0 {
  1647  			return "-" + strconv.FormatUint(zzNeg[0], base)
  1648  		}
  1649  	}
  1650  	zz := *z
  1651  	zz.fromMont()
  1652  	if zz.FitsOnOneWord() {
  1653  		return strconv.FormatUint(zz[0], base)
  1654  	}
  1655  	vv := pool.BigInt.Get()
  1656  	r := zz.toBigInt(vv).Text(base)
  1657  	pool.BigInt.Put(vv)
  1658  	return r
  1659  }
  1660  
  1661  // BigInt sets and return z as a *big.Int
  1662  func (z *Element) BigInt(res *big.Int) *big.Int {
  1663  	_z := *z
  1664  	_z.fromMont()
  1665  	return _z.toBigInt(res)
  1666  }
  1667  
  1668  // ToBigIntRegular returns z as a big.Int in regular form
  1669  //
  1670  // Deprecated: use BigInt(*big.Int) instead
  1671  func (z Element) ToBigIntRegular(res *big.Int) *big.Int {
  1672  	z.fromMont()
  1673  	return z.toBigInt(res)
  1674  }
  1675  
  1676  // Bits provides access to z by returning its value as a little-endian [12]uint64 array.
  1677  // Bits is intended to support implementation of missing low-level Element
  1678  // functionality outside this package; it should be avoided otherwise.
  1679  func (z *Element) Bits() [12]uint64 {
  1680  	_z := *z
  1681  	fromMont(&_z)
  1682  	return _z
  1683  }
  1684  
  1685  // Bytes returns the value of z as a big-endian byte array
  1686  func (z *Element) Bytes() (res [Bytes]byte) {
  1687  	BigEndian.PutElement(&res, *z)
  1688  	return
  1689  }
  1690  
  1691  // Marshal returns the value of z as a big-endian byte slice
  1692  func (z *Element) Marshal() []byte {
  1693  	b := z.Bytes()
  1694  	return b[:]
  1695  }
  1696  
  1697  // Unmarshal is an alias for SetBytes, it sets z to the value of e.
  1698  func (z *Element) Unmarshal(e []byte) {
  1699  	z.SetBytes(e)
  1700  }
  1701  
  1702  // SetBytes interprets e as the bytes of a big-endian unsigned integer,
  1703  // sets z to that value, and returns z.
  1704  func (z *Element) SetBytes(e []byte) *Element {
  1705  	if len(e) == Bytes {
  1706  		// fast path
  1707  		v, err := BigEndian.Element((*[Bytes]byte)(e))
  1708  		if err == nil {
  1709  			*z = v
  1710  			return z
  1711  		}
  1712  	}
  1713  
  1714  	// slow path.
  1715  	// get a big int from our pool
  1716  	vv := pool.BigInt.Get()
  1717  	vv.SetBytes(e)
  1718  
  1719  	// set big int
  1720  	z.SetBigInt(vv)
  1721  
  1722  	// put temporary object back in pool
  1723  	pool.BigInt.Put(vv)
  1724  
  1725  	return z
  1726  }
  1727  
  1728  // SetBytesCanonical interprets e as the bytes of a big-endian 96-byte integer.
  1729  // If e is not a 96-byte slice or encodes a value higher than q,
  1730  // SetBytesCanonical returns an error.
  1731  func (z *Element) SetBytesCanonical(e []byte) error {
  1732  	if len(e) != Bytes {
  1733  		return errors.New("invalid fp.Element encoding")
  1734  	}
  1735  	v, err := BigEndian.Element((*[Bytes]byte)(e))
  1736  	if err != nil {
  1737  		return err
  1738  	}
  1739  	*z = v
  1740  	return nil
  1741  }
  1742  
  1743  // SetBigInt sets z to v and returns z
  1744  func (z *Element) SetBigInt(v *big.Int) *Element {
  1745  	z.SetZero()
  1746  
  1747  	var zero big.Int
  1748  
  1749  	// fast path
  1750  	c := v.Cmp(&_modulus)
  1751  	if c == 0 {
  1752  		// v == 0
  1753  		return z
  1754  	} else if c != 1 && v.Cmp(&zero) != -1 {
  1755  		// 0 < v < q
  1756  		return z.setBigInt(v)
  1757  	}
  1758  
  1759  	// get temporary big int from the pool
  1760  	vv := pool.BigInt.Get()
  1761  
  1762  	// copy input + modular reduction
  1763  	vv.Mod(v, &_modulus)
  1764  
  1765  	// set big int byte value
  1766  	z.setBigInt(vv)
  1767  
  1768  	// release object into pool
  1769  	pool.BigInt.Put(vv)
  1770  	return z
  1771  }
  1772  
  1773  // setBigInt assumes 0 ⩽ v < q
  1774  func (z *Element) setBigInt(v *big.Int) *Element {
  1775  	vBits := v.Bits()
  1776  
  1777  	if bits.UintSize == 64 {
  1778  		for i := 0; i < len(vBits); i++ {
  1779  			z[i] = uint64(vBits[i])
  1780  		}
  1781  	} else {
  1782  		for i := 0; i < len(vBits); i++ {
  1783  			if i%2 == 0 {
  1784  				z[i/2] = uint64(vBits[i])
  1785  			} else {
  1786  				z[i/2] |= uint64(vBits[i]) << 32
  1787  			}
  1788  		}
  1789  	}
  1790  
  1791  	return z.toMont()
  1792  }
  1793  
  1794  // SetString creates a big.Int with number and calls SetBigInt on z
  1795  //
  1796  // The number prefix determines the actual base: A prefix of
  1797  // ”0b” or ”0B” selects base 2, ”0”, ”0o” or ”0O” selects base 8,
  1798  // and ”0x” or ”0X” selects base 16. Otherwise, the selected base is 10
  1799  // and no prefix is accepted.
  1800  //
  1801  // For base 16, lower and upper case letters are considered the same:
  1802  // The letters 'a' to 'f' and 'A' to 'F' represent digit values 10 to 15.
  1803  //
  1804  // An underscore character ”_” may appear between a base
  1805  // prefix and an adjacent digit, and between successive digits; such
  1806  // underscores do not change the value of the number.
  1807  // Incorrect placement of underscores is reported as a panic if there
  1808  // are no other errors.
  1809  //
  1810  // If the number is invalid this method leaves z unchanged and returns nil, error.
  1811  func (z *Element) SetString(number string) (*Element, error) {
  1812  	// get temporary big int from the pool
  1813  	vv := pool.BigInt.Get()
  1814  
  1815  	if _, ok := vv.SetString(number, 0); !ok {
  1816  		return nil, errors.New("Element.SetString failed -> can't parse number into a big.Int " + number)
  1817  	}
  1818  
  1819  	z.SetBigInt(vv)
  1820  
  1821  	// release object into pool
  1822  	pool.BigInt.Put(vv)
  1823  
  1824  	return z, nil
  1825  }
  1826  
  1827  // MarshalJSON returns json encoding of z (z.Text(10))
  1828  // If z == nil, returns null
  1829  func (z *Element) MarshalJSON() ([]byte, error) {
  1830  	if z == nil {
  1831  		return []byte("null"), nil
  1832  	}
  1833  	const maxSafeBound = 15 // we encode it as number if it's small
  1834  	s := z.Text(10)
  1835  	if len(s) <= maxSafeBound {
  1836  		return []byte(s), nil
  1837  	}
  1838  	var sbb strings.Builder
  1839  	sbb.WriteByte('"')
  1840  	sbb.WriteString(s)
  1841  	sbb.WriteByte('"')
  1842  	return []byte(sbb.String()), nil
  1843  }
  1844  
  1845  // UnmarshalJSON accepts numbers and strings as input
  1846  // See Element.SetString for valid prefixes (0x, 0b, ...)
  1847  func (z *Element) UnmarshalJSON(data []byte) error {
  1848  	s := string(data)
  1849  	if len(s) > Bits*3 {
  1850  		return errors.New("value too large (max = Element.Bits * 3)")
  1851  	}
  1852  
  1853  	// we accept numbers and strings, remove leading and trailing quotes if any
  1854  	if len(s) > 0 && s[0] == '"' {
  1855  		s = s[1:]
  1856  	}
  1857  	if len(s) > 0 && s[len(s)-1] == '"' {
  1858  		s = s[:len(s)-1]
  1859  	}
  1860  
  1861  	// get temporary big int from the pool
  1862  	vv := pool.BigInt.Get()
  1863  
  1864  	if _, ok := vv.SetString(s, 0); !ok {
  1865  		return errors.New("can't parse into a big.Int: " + s)
  1866  	}
  1867  
  1868  	z.SetBigInt(vv)
  1869  
  1870  	// release object into pool
  1871  	pool.BigInt.Put(vv)
  1872  	return nil
  1873  }
  1874  
  1875  // A ByteOrder specifies how to convert byte slices into a Element
  1876  type ByteOrder interface {
  1877  	Element(*[Bytes]byte) (Element, error)
  1878  	PutElement(*[Bytes]byte, Element)
  1879  	String() string
  1880  }
  1881  
  1882  // BigEndian is the big-endian implementation of ByteOrder and AppendByteOrder.
  1883  var BigEndian bigEndian
  1884  
  1885  type bigEndian struct{}
  1886  
  1887  // Element interpret b is a big-endian 96-byte slice.
  1888  // If b encodes a value higher than q, Element returns error.
  1889  func (bigEndian) Element(b *[Bytes]byte) (Element, error) {
  1890  	var z Element
  1891  	z[0] = binary.BigEndian.Uint64((*b)[88:96])
  1892  	z[1] = binary.BigEndian.Uint64((*b)[80:88])
  1893  	z[2] = binary.BigEndian.Uint64((*b)[72:80])
  1894  	z[3] = binary.BigEndian.Uint64((*b)[64:72])
  1895  	z[4] = binary.BigEndian.Uint64((*b)[56:64])
  1896  	z[5] = binary.BigEndian.Uint64((*b)[48:56])
  1897  	z[6] = binary.BigEndian.Uint64((*b)[40:48])
  1898  	z[7] = binary.BigEndian.Uint64((*b)[32:40])
  1899  	z[8] = binary.BigEndian.Uint64((*b)[24:32])
  1900  	z[9] = binary.BigEndian.Uint64((*b)[16:24])
  1901  	z[10] = binary.BigEndian.Uint64((*b)[8:16])
  1902  	z[11] = binary.BigEndian.Uint64((*b)[0:8])
  1903  
  1904  	if !z.smallerThanModulus() {
  1905  		return Element{}, errors.New("invalid fp.Element encoding")
  1906  	}
  1907  
  1908  	z.toMont()
  1909  	return z, nil
  1910  }
  1911  
  1912  func (bigEndian) PutElement(b *[Bytes]byte, e Element) {
  1913  	e.fromMont()
  1914  	binary.BigEndian.PutUint64((*b)[88:96], e[0])
  1915  	binary.BigEndian.PutUint64((*b)[80:88], e[1])
  1916  	binary.BigEndian.PutUint64((*b)[72:80], e[2])
  1917  	binary.BigEndian.PutUint64((*b)[64:72], e[3])
  1918  	binary.BigEndian.PutUint64((*b)[56:64], e[4])
  1919  	binary.BigEndian.PutUint64((*b)[48:56], e[5])
  1920  	binary.BigEndian.PutUint64((*b)[40:48], e[6])
  1921  	binary.BigEndian.PutUint64((*b)[32:40], e[7])
  1922  	binary.BigEndian.PutUint64((*b)[24:32], e[8])
  1923  	binary.BigEndian.PutUint64((*b)[16:24], e[9])
  1924  	binary.BigEndian.PutUint64((*b)[8:16], e[10])
  1925  	binary.BigEndian.PutUint64((*b)[0:8], e[11])
  1926  }
  1927  
  1928  func (bigEndian) String() string { return "BigEndian" }
  1929  
  1930  // LittleEndian is the little-endian implementation of ByteOrder and AppendByteOrder.
  1931  var LittleEndian littleEndian
  1932  
  1933  type littleEndian struct{}
  1934  
  1935  func (littleEndian) Element(b *[Bytes]byte) (Element, error) {
  1936  	var z Element
  1937  	z[0] = binary.LittleEndian.Uint64((*b)[0:8])
  1938  	z[1] = binary.LittleEndian.Uint64((*b)[8:16])
  1939  	z[2] = binary.LittleEndian.Uint64((*b)[16:24])
  1940  	z[3] = binary.LittleEndian.Uint64((*b)[24:32])
  1941  	z[4] = binary.LittleEndian.Uint64((*b)[32:40])
  1942  	z[5] = binary.LittleEndian.Uint64((*b)[40:48])
  1943  	z[6] = binary.LittleEndian.Uint64((*b)[48:56])
  1944  	z[7] = binary.LittleEndian.Uint64((*b)[56:64])
  1945  	z[8] = binary.LittleEndian.Uint64((*b)[64:72])
  1946  	z[9] = binary.LittleEndian.Uint64((*b)[72:80])
  1947  	z[10] = binary.LittleEndian.Uint64((*b)[80:88])
  1948  	z[11] = binary.LittleEndian.Uint64((*b)[88:96])
  1949  
  1950  	if !z.smallerThanModulus() {
  1951  		return Element{}, errors.New("invalid fp.Element encoding")
  1952  	}
  1953  
  1954  	z.toMont()
  1955  	return z, nil
  1956  }
  1957  
  1958  func (littleEndian) PutElement(b *[Bytes]byte, e Element) {
  1959  	e.fromMont()
  1960  	binary.LittleEndian.PutUint64((*b)[0:8], e[0])
  1961  	binary.LittleEndian.PutUint64((*b)[8:16], e[1])
  1962  	binary.LittleEndian.PutUint64((*b)[16:24], e[2])
  1963  	binary.LittleEndian.PutUint64((*b)[24:32], e[3])
  1964  	binary.LittleEndian.PutUint64((*b)[32:40], e[4])
  1965  	binary.LittleEndian.PutUint64((*b)[40:48], e[5])
  1966  	binary.LittleEndian.PutUint64((*b)[48:56], e[6])
  1967  	binary.LittleEndian.PutUint64((*b)[56:64], e[7])
  1968  	binary.LittleEndian.PutUint64((*b)[64:72], e[8])
  1969  	binary.LittleEndian.PutUint64((*b)[72:80], e[9])
  1970  	binary.LittleEndian.PutUint64((*b)[80:88], e[10])
  1971  	binary.LittleEndian.PutUint64((*b)[88:96], e[11])
  1972  }
  1973  
  1974  func (littleEndian) String() string { return "LittleEndian" }
  1975  
  1976  // Legendre returns the Legendre symbol of z (either +1, -1, or 0.)
  1977  func (z *Element) Legendre() int {
  1978  	var l Element
  1979  	// z^((q-1)/2)
  1980  	l.expByLegendreExp(*z)
  1981  
  1982  	if l.IsZero() {
  1983  		return 0
  1984  	}
  1985  
  1986  	// if l == 1
  1987  	if l.IsOne() {
  1988  		return 1
  1989  	}
  1990  	return -1
  1991  }
  1992  
  1993  // Sqrt z = √x (mod q)
  1994  // if the square root doesn't exist (x is not a square mod q)
  1995  // Sqrt leaves z unchanged and returns nil
  1996  func (z *Element) Sqrt(x *Element) *Element {
  1997  	// q ≡ 3 (mod 4)
  1998  	// using  z ≡ ± x^((p+1)/4) (mod q)
  1999  	var y, square Element
  2000  	y.expBySqrtExp(*x)
  2001  	// as we didn't compute the legendre symbol, ensure we found y such that y * y = x
  2002  	square.Square(&y)
  2003  	if square.Equal(x) {
  2004  		return z.Set(&y)
  2005  	}
  2006  	return nil
  2007  }
  2008  
  2009  const (
  2010  	k               = 32 // word size / 2
  2011  	signBitSelector = uint64(1) << 63
  2012  	approxLowBitsN  = k - 1
  2013  	approxHighBitsN = k + 1
  2014  )
  2015  
  2016  const (
  2017  	inversionCorrectionFactorWord0  = 10031381836020524396
  2018  	inversionCorrectionFactorWord1  = 13512348327667036555
  2019  	inversionCorrectionFactorWord2  = 4712458382768910368
  2020  	inversionCorrectionFactorWord3  = 10608536430169864474
  2021  	inversionCorrectionFactorWord4  = 6011516409926961524
  2022  	inversionCorrectionFactorWord5  = 453532925360796333
  2023  	inversionCorrectionFactorWord6  = 12814326068023107562
  2024  	inversionCorrectionFactorWord7  = 4764844688547002673
  2025  	inversionCorrectionFactorWord8  = 297975318568649638
  2026  	inversionCorrectionFactorWord9  = 3076225984588847531
  2027  	inversionCorrectionFactorWord10 = 327844855039329024
  2028  	inversionCorrectionFactorWord11 = 65221078716978344
  2029  	invIterationsN                  = 50
  2030  )
  2031  
  2032  // Inverse z = x⁻¹ (mod q)
  2033  //
  2034  // if x == 0, sets and returns z = x
  2035  func (z *Element) Inverse(x *Element) *Element {
  2036  	// Implements "Optimized Binary GCD for Modular Inversion"
  2037  	// https://github.com/pornin/bingcd/blob/main/doc/bingcd.pdf
  2038  
  2039  	a := *x
  2040  	b := Element{
  2041  		q0,
  2042  		q1,
  2043  		q2,
  2044  		q3,
  2045  		q4,
  2046  		q5,
  2047  		q6,
  2048  		q7,
  2049  		q8,
  2050  		q9,
  2051  		q10,
  2052  		q11,
  2053  	} // b := q
  2054  
  2055  	u := Element{1}
  2056  
  2057  	// Update factors: we get [u; v] ← [f₀ g₀; f₁ g₁] [u; v]
  2058  	// cᵢ = fᵢ + 2³¹ - 1 + 2³² * (gᵢ + 2³¹ - 1)
  2059  	var c0, c1 int64
  2060  
  2061  	// Saved update factors to reduce the number of field multiplications
  2062  	var pf0, pf1, pg0, pg1 int64
  2063  
  2064  	var i uint
  2065  
  2066  	var v, s Element
  2067  
  2068  	// Since u,v are updated every other iteration, we must make sure we terminate after evenly many iterations
  2069  	// This also lets us get away with half as many updates to u,v
  2070  	// To make this constant-time-ish, replace the condition with i < invIterationsN
  2071  	for i = 0; i&1 == 1 || !a.IsZero(); i++ {
  2072  		n := max(a.BitLen(), b.BitLen())
  2073  		aApprox, bApprox := approximate(&a, n), approximate(&b, n)
  2074  
  2075  		// f₀, g₀, f₁, g₁ = 1, 0, 0, 1
  2076  		c0, c1 = updateFactorIdentityMatrixRow0, updateFactorIdentityMatrixRow1
  2077  
  2078  		for j := 0; j < approxLowBitsN; j++ {
  2079  
  2080  			// -2ʲ < f₀, f₁ ≤ 2ʲ
  2081  			// |f₀| + |f₁| < 2ʲ⁺¹
  2082  
  2083  			if aApprox&1 == 0 {
  2084  				aApprox /= 2
  2085  			} else {
  2086  				s, borrow := bits.Sub64(aApprox, bApprox, 0)
  2087  				if borrow == 1 {
  2088  					s = bApprox - aApprox
  2089  					bApprox = aApprox
  2090  					c0, c1 = c1, c0
  2091  					// invariants unchanged
  2092  				}
  2093  
  2094  				aApprox = s / 2
  2095  				c0 = c0 - c1
  2096  
  2097  				// Now |f₀| < 2ʲ⁺¹ ≤ 2ʲ⁺¹ (only the weaker inequality is needed, strictly speaking)
  2098  				// Started with f₀ > -2ʲ and f₁ ≤ 2ʲ, so f₀ - f₁ > -2ʲ⁺¹
  2099  				// Invariants unchanged for f₁
  2100  			}
  2101  
  2102  			c1 *= 2
  2103  			// -2ʲ⁺¹ < f₁ ≤ 2ʲ⁺¹
  2104  			// So now |f₀| + |f₁| < 2ʲ⁺²
  2105  		}
  2106  
  2107  		s = a
  2108  
  2109  		var g0 int64
  2110  		// from this point on c0 aliases for f0
  2111  		c0, g0 = updateFactorsDecompose(c0)
  2112  		aHi := a.linearCombNonModular(&s, c0, &b, g0)
  2113  		if aHi&signBitSelector != 0 {
  2114  			// if aHi < 0
  2115  			c0, g0 = -c0, -g0
  2116  			aHi = negL(&a, aHi)
  2117  		}
  2118  		// right-shift a by k-1 bits
  2119  		a[0] = (a[0] >> approxLowBitsN) | ((a[1]) << approxHighBitsN)
  2120  		a[1] = (a[1] >> approxLowBitsN) | ((a[2]) << approxHighBitsN)
  2121  		a[2] = (a[2] >> approxLowBitsN) | ((a[3]) << approxHighBitsN)
  2122  		a[3] = (a[3] >> approxLowBitsN) | ((a[4]) << approxHighBitsN)
  2123  		a[4] = (a[4] >> approxLowBitsN) | ((a[5]) << approxHighBitsN)
  2124  		a[5] = (a[5] >> approxLowBitsN) | ((a[6]) << approxHighBitsN)
  2125  		a[6] = (a[6] >> approxLowBitsN) | ((a[7]) << approxHighBitsN)
  2126  		a[7] = (a[7] >> approxLowBitsN) | ((a[8]) << approxHighBitsN)
  2127  		a[8] = (a[8] >> approxLowBitsN) | ((a[9]) << approxHighBitsN)
  2128  		a[9] = (a[9] >> approxLowBitsN) | ((a[10]) << approxHighBitsN)
  2129  		a[10] = (a[10] >> approxLowBitsN) | ((a[11]) << approxHighBitsN)
  2130  		a[11] = (a[11] >> approxLowBitsN) | (aHi << approxHighBitsN)
  2131  
  2132  		var f1 int64
  2133  		// from this point on c1 aliases for g0
  2134  		f1, c1 = updateFactorsDecompose(c1)
  2135  		bHi := b.linearCombNonModular(&s, f1, &b, c1)
  2136  		if bHi&signBitSelector != 0 {
  2137  			// if bHi < 0
  2138  			f1, c1 = -f1, -c1
  2139  			bHi = negL(&b, bHi)
  2140  		}
  2141  		// right-shift b by k-1 bits
  2142  		b[0] = (b[0] >> approxLowBitsN) | ((b[1]) << approxHighBitsN)
  2143  		b[1] = (b[1] >> approxLowBitsN) | ((b[2]) << approxHighBitsN)
  2144  		b[2] = (b[2] >> approxLowBitsN) | ((b[3]) << approxHighBitsN)
  2145  		b[3] = (b[3] >> approxLowBitsN) | ((b[4]) << approxHighBitsN)
  2146  		b[4] = (b[4] >> approxLowBitsN) | ((b[5]) << approxHighBitsN)
  2147  		b[5] = (b[5] >> approxLowBitsN) | ((b[6]) << approxHighBitsN)
  2148  		b[6] = (b[6] >> approxLowBitsN) | ((b[7]) << approxHighBitsN)
  2149  		b[7] = (b[7] >> approxLowBitsN) | ((b[8]) << approxHighBitsN)
  2150  		b[8] = (b[8] >> approxLowBitsN) | ((b[9]) << approxHighBitsN)
  2151  		b[9] = (b[9] >> approxLowBitsN) | ((b[10]) << approxHighBitsN)
  2152  		b[10] = (b[10] >> approxLowBitsN) | ((b[11]) << approxHighBitsN)
  2153  		b[11] = (b[11] >> approxLowBitsN) | (bHi << approxHighBitsN)
  2154  
  2155  		if i&1 == 1 {
  2156  			// Combine current update factors with previously stored ones
  2157  			// [F₀, G₀; F₁, G₁] ← [f₀, g₀; f₁, g₁] [pf₀, pg₀; pf₁, pg₁], with capital letters denoting new combined values
  2158  			// We get |F₀| = | f₀pf₀ + g₀pf₁ | ≤ |f₀pf₀| + |g₀pf₁| = |f₀| |pf₀| + |g₀| |pf₁| ≤ 2ᵏ⁻¹|pf₀| + 2ᵏ⁻¹|pf₁|
  2159  			// = 2ᵏ⁻¹ (|pf₀| + |pf₁|) < 2ᵏ⁻¹ 2ᵏ = 2²ᵏ⁻¹
  2160  			// So |F₀| < 2²ᵏ⁻¹ meaning it fits in a 2k-bit signed register
  2161  
  2162  			// c₀ aliases f₀, c₁ aliases g₁
  2163  			c0, g0, f1, c1 = c0*pf0+g0*pf1,
  2164  				c0*pg0+g0*pg1,
  2165  				f1*pf0+c1*pf1,
  2166  				f1*pg0+c1*pg1
  2167  
  2168  			s = u
  2169  
  2170  			// 0 ≤ u, v < 2²⁵⁵
  2171  			// |F₀|, |G₀| < 2⁶³
  2172  			u.linearComb(&u, c0, &v, g0)
  2173  			// |F₁|, |G₁| < 2⁶³
  2174  			v.linearComb(&s, f1, &v, c1)
  2175  
  2176  		} else {
  2177  			// Save update factors
  2178  			pf0, pg0, pf1, pg1 = c0, g0, f1, c1
  2179  		}
  2180  	}
  2181  
  2182  	// For every iteration that we miss, v is not being multiplied by 2ᵏ⁻²
  2183  	const pSq uint64 = 1 << (2 * (k - 1))
  2184  	a = Element{pSq}
  2185  	// If the function is constant-time ish, this loop will not run (no need to take it out explicitly)
  2186  	for ; i < invIterationsN; i += 2 {
  2187  		// could optimize further with mul by word routine or by pre-computing a table since with k=26,
  2188  		// we would multiply by pSq up to 13times;
  2189  		// on x86, the assembly routine outperforms generic code for mul by word
  2190  		// on arm64, we may loose up to ~5% for 6 limbs
  2191  		v.Mul(&v, &a)
  2192  	}
  2193  
  2194  	u.Set(x) // for correctness check
  2195  
  2196  	z.Mul(&v, &Element{
  2197  		inversionCorrectionFactorWord0,
  2198  		inversionCorrectionFactorWord1,
  2199  		inversionCorrectionFactorWord2,
  2200  		inversionCorrectionFactorWord3,
  2201  		inversionCorrectionFactorWord4,
  2202  		inversionCorrectionFactorWord5,
  2203  		inversionCorrectionFactorWord6,
  2204  		inversionCorrectionFactorWord7,
  2205  		inversionCorrectionFactorWord8,
  2206  		inversionCorrectionFactorWord9,
  2207  		inversionCorrectionFactorWord10,
  2208  		inversionCorrectionFactorWord11,
  2209  	})
  2210  
  2211  	// correctness check
  2212  	v.Mul(&u, z)
  2213  	if !v.IsOne() && !u.IsZero() {
  2214  		return z.inverseExp(u)
  2215  	}
  2216  
  2217  	return z
  2218  }
  2219  
  2220  // inverseExp computes z = x⁻¹ (mod q) = x**(q-2) (mod q)
  2221  func (z *Element) inverseExp(x Element) *Element {
  2222  	// e == q-2
  2223  	e := Modulus()
  2224  	e.Sub(e, big.NewInt(2))
  2225  
  2226  	z.Set(&x)
  2227  
  2228  	for i := e.BitLen() - 2; i >= 0; i-- {
  2229  		z.Square(z)
  2230  		if e.Bit(i) == 1 {
  2231  			z.Mul(z, &x)
  2232  		}
  2233  	}
  2234  
  2235  	return z
  2236  }
  2237  
  2238  // approximate a big number x into a single 64 bit word using its uppermost and lowermost bits
  2239  // if x fits in a word as is, no approximation necessary
  2240  func approximate(x *Element, nBits int) uint64 {
  2241  
  2242  	if nBits <= 64 {
  2243  		return x[0]
  2244  	}
  2245  
  2246  	const mask = (uint64(1) << (k - 1)) - 1 // k-1 ones
  2247  	lo := mask & x[0]
  2248  
  2249  	hiWordIndex := (nBits - 1) / 64
  2250  
  2251  	hiWordBitsAvailable := nBits - hiWordIndex*64
  2252  	hiWordBitsUsed := min(hiWordBitsAvailable, approxHighBitsN)
  2253  
  2254  	mask_ := uint64(^((1 << (hiWordBitsAvailable - hiWordBitsUsed)) - 1))
  2255  	hi := (x[hiWordIndex] & mask_) << (64 - hiWordBitsAvailable)
  2256  
  2257  	mask_ = ^(1<<(approxLowBitsN+hiWordBitsUsed) - 1)
  2258  	mid := (mask_ & x[hiWordIndex-1]) >> hiWordBitsUsed
  2259  
  2260  	return lo | mid | hi
  2261  }
  2262  
  2263  // linearComb z = xC * x + yC * y;
  2264  // 0 ≤ x, y < 2⁷⁶¹
  2265  // |xC|, |yC| < 2⁶³
  2266  func (z *Element) linearComb(x *Element, xC int64, y *Element, yC int64) {
  2267  	// | (hi, z) | < 2 * 2⁶³ * 2⁷⁶¹ = 2⁸²⁵
  2268  	// therefore | hi | < 2⁵⁷ ≤ 2⁶³
  2269  	hi := z.linearCombNonModular(x, xC, y, yC)
  2270  	z.montReduceSigned(z, hi)
  2271  }
  2272  
  2273  // montReduceSigned z = (xHi * r + x) * r⁻¹ using the SOS algorithm
  2274  // Requires |xHi| < 2⁶³. Most significant bit of xHi is the sign bit.
  2275  func (z *Element) montReduceSigned(x *Element, xHi uint64) {
  2276  	const signBitRemover = ^signBitSelector
  2277  	mustNeg := xHi&signBitSelector != 0
  2278  	// the SOS implementation requires that most significant bit is 0
  2279  	// Let X be xHi*r + x
  2280  	// If X is negative we would have initially stored it as 2⁶⁴ r + X (à la 2's complement)
  2281  	xHi &= signBitRemover
  2282  	// with this a negative X is now represented as 2⁶³ r + X
  2283  
  2284  	var t [2*Limbs - 1]uint64
  2285  	var C uint64
  2286  
  2287  	m := x[0] * qInvNeg
  2288  
  2289  	C = madd0(m, q0, x[0])
  2290  	C, t[1] = madd2(m, q1, x[1], C)
  2291  	C, t[2] = madd2(m, q2, x[2], C)
  2292  	C, t[3] = madd2(m, q3, x[3], C)
  2293  	C, t[4] = madd2(m, q4, x[4], C)
  2294  	C, t[5] = madd2(m, q5, x[5], C)
  2295  	C, t[6] = madd2(m, q6, x[6], C)
  2296  	C, t[7] = madd2(m, q7, x[7], C)
  2297  	C, t[8] = madd2(m, q8, x[8], C)
  2298  	C, t[9] = madd2(m, q9, x[9], C)
  2299  	C, t[10] = madd2(m, q10, x[10], C)
  2300  	C, t[11] = madd2(m, q11, x[11], C)
  2301  
  2302  	// m * qElement[11] ≤ (2⁶⁴ - 1) * (2⁶³ - 1) = 2¹²⁷ - 2⁶⁴ - 2⁶³ + 1
  2303  	// x[11] + C ≤ 2*(2⁶⁴ - 1) = 2⁶⁵ - 2
  2304  	// On LHS, (C, t[11]) ≤ 2¹²⁷ - 2⁶⁴ - 2⁶³ + 1 + 2⁶⁵ - 2 = 2¹²⁷ + 2⁶³ - 1
  2305  	// So on LHS, C ≤ 2⁶³
  2306  	t[12] = xHi + C
  2307  	// xHi + C < 2⁶³ + 2⁶³ = 2⁶⁴
  2308  
  2309  	// <standard SOS>
  2310  	{
  2311  		const i = 1
  2312  		m = t[i] * qInvNeg
  2313  
  2314  		C = madd0(m, q0, t[i+0])
  2315  		C, t[i+1] = madd2(m, q1, t[i+1], C)
  2316  		C, t[i+2] = madd2(m, q2, t[i+2], C)
  2317  		C, t[i+3] = madd2(m, q3, t[i+3], C)
  2318  		C, t[i+4] = madd2(m, q4, t[i+4], C)
  2319  		C, t[i+5] = madd2(m, q5, t[i+5], C)
  2320  		C, t[i+6] = madd2(m, q6, t[i+6], C)
  2321  		C, t[i+7] = madd2(m, q7, t[i+7], C)
  2322  		C, t[i+8] = madd2(m, q8, t[i+8], C)
  2323  		C, t[i+9] = madd2(m, q9, t[i+9], C)
  2324  		C, t[i+10] = madd2(m, q10, t[i+10], C)
  2325  		C, t[i+11] = madd2(m, q11, t[i+11], C)
  2326  
  2327  		t[i+Limbs] += C
  2328  	}
  2329  	{
  2330  		const i = 2
  2331  		m = t[i] * qInvNeg
  2332  
  2333  		C = madd0(m, q0, t[i+0])
  2334  		C, t[i+1] = madd2(m, q1, t[i+1], C)
  2335  		C, t[i+2] = madd2(m, q2, t[i+2], C)
  2336  		C, t[i+3] = madd2(m, q3, t[i+3], C)
  2337  		C, t[i+4] = madd2(m, q4, t[i+4], C)
  2338  		C, t[i+5] = madd2(m, q5, t[i+5], C)
  2339  		C, t[i+6] = madd2(m, q6, t[i+6], C)
  2340  		C, t[i+7] = madd2(m, q7, t[i+7], C)
  2341  		C, t[i+8] = madd2(m, q8, t[i+8], C)
  2342  		C, t[i+9] = madd2(m, q9, t[i+9], C)
  2343  		C, t[i+10] = madd2(m, q10, t[i+10], C)
  2344  		C, t[i+11] = madd2(m, q11, t[i+11], C)
  2345  
  2346  		t[i+Limbs] += C
  2347  	}
  2348  	{
  2349  		const i = 3
  2350  		m = t[i] * qInvNeg
  2351  
  2352  		C = madd0(m, q0, t[i+0])
  2353  		C, t[i+1] = madd2(m, q1, t[i+1], C)
  2354  		C, t[i+2] = madd2(m, q2, t[i+2], C)
  2355  		C, t[i+3] = madd2(m, q3, t[i+3], C)
  2356  		C, t[i+4] = madd2(m, q4, t[i+4], C)
  2357  		C, t[i+5] = madd2(m, q5, t[i+5], C)
  2358  		C, t[i+6] = madd2(m, q6, t[i+6], C)
  2359  		C, t[i+7] = madd2(m, q7, t[i+7], C)
  2360  		C, t[i+8] = madd2(m, q8, t[i+8], C)
  2361  		C, t[i+9] = madd2(m, q9, t[i+9], C)
  2362  		C, t[i+10] = madd2(m, q10, t[i+10], C)
  2363  		C, t[i+11] = madd2(m, q11, t[i+11], C)
  2364  
  2365  		t[i+Limbs] += C
  2366  	}
  2367  	{
  2368  		const i = 4
  2369  		m = t[i] * qInvNeg
  2370  
  2371  		C = madd0(m, q0, t[i+0])
  2372  		C, t[i+1] = madd2(m, q1, t[i+1], C)
  2373  		C, t[i+2] = madd2(m, q2, t[i+2], C)
  2374  		C, t[i+3] = madd2(m, q3, t[i+3], C)
  2375  		C, t[i+4] = madd2(m, q4, t[i+4], C)
  2376  		C, t[i+5] = madd2(m, q5, t[i+5], C)
  2377  		C, t[i+6] = madd2(m, q6, t[i+6], C)
  2378  		C, t[i+7] = madd2(m, q7, t[i+7], C)
  2379  		C, t[i+8] = madd2(m, q8, t[i+8], C)
  2380  		C, t[i+9] = madd2(m, q9, t[i+9], C)
  2381  		C, t[i+10] = madd2(m, q10, t[i+10], C)
  2382  		C, t[i+11] = madd2(m, q11, t[i+11], C)
  2383  
  2384  		t[i+Limbs] += C
  2385  	}
  2386  	{
  2387  		const i = 5
  2388  		m = t[i] * qInvNeg
  2389  
  2390  		C = madd0(m, q0, t[i+0])
  2391  		C, t[i+1] = madd2(m, q1, t[i+1], C)
  2392  		C, t[i+2] = madd2(m, q2, t[i+2], C)
  2393  		C, t[i+3] = madd2(m, q3, t[i+3], C)
  2394  		C, t[i+4] = madd2(m, q4, t[i+4], C)
  2395  		C, t[i+5] = madd2(m, q5, t[i+5], C)
  2396  		C, t[i+6] = madd2(m, q6, t[i+6], C)
  2397  		C, t[i+7] = madd2(m, q7, t[i+7], C)
  2398  		C, t[i+8] = madd2(m, q8, t[i+8], C)
  2399  		C, t[i+9] = madd2(m, q9, t[i+9], C)
  2400  		C, t[i+10] = madd2(m, q10, t[i+10], C)
  2401  		C, t[i+11] = madd2(m, q11, t[i+11], C)
  2402  
  2403  		t[i+Limbs] += C
  2404  	}
  2405  	{
  2406  		const i = 6
  2407  		m = t[i] * qInvNeg
  2408  
  2409  		C = madd0(m, q0, t[i+0])
  2410  		C, t[i+1] = madd2(m, q1, t[i+1], C)
  2411  		C, t[i+2] = madd2(m, q2, t[i+2], C)
  2412  		C, t[i+3] = madd2(m, q3, t[i+3], C)
  2413  		C, t[i+4] = madd2(m, q4, t[i+4], C)
  2414  		C, t[i+5] = madd2(m, q5, t[i+5], C)
  2415  		C, t[i+6] = madd2(m, q6, t[i+6], C)
  2416  		C, t[i+7] = madd2(m, q7, t[i+7], C)
  2417  		C, t[i+8] = madd2(m, q8, t[i+8], C)
  2418  		C, t[i+9] = madd2(m, q9, t[i+9], C)
  2419  		C, t[i+10] = madd2(m, q10, t[i+10], C)
  2420  		C, t[i+11] = madd2(m, q11, t[i+11], C)
  2421  
  2422  		t[i+Limbs] += C
  2423  	}
  2424  	{
  2425  		const i = 7
  2426  		m = t[i] * qInvNeg
  2427  
  2428  		C = madd0(m, q0, t[i+0])
  2429  		C, t[i+1] = madd2(m, q1, t[i+1], C)
  2430  		C, t[i+2] = madd2(m, q2, t[i+2], C)
  2431  		C, t[i+3] = madd2(m, q3, t[i+3], C)
  2432  		C, t[i+4] = madd2(m, q4, t[i+4], C)
  2433  		C, t[i+5] = madd2(m, q5, t[i+5], C)
  2434  		C, t[i+6] = madd2(m, q6, t[i+6], C)
  2435  		C, t[i+7] = madd2(m, q7, t[i+7], C)
  2436  		C, t[i+8] = madd2(m, q8, t[i+8], C)
  2437  		C, t[i+9] = madd2(m, q9, t[i+9], C)
  2438  		C, t[i+10] = madd2(m, q10, t[i+10], C)
  2439  		C, t[i+11] = madd2(m, q11, t[i+11], C)
  2440  
  2441  		t[i+Limbs] += C
  2442  	}
  2443  	{
  2444  		const i = 8
  2445  		m = t[i] * qInvNeg
  2446  
  2447  		C = madd0(m, q0, t[i+0])
  2448  		C, t[i+1] = madd2(m, q1, t[i+1], C)
  2449  		C, t[i+2] = madd2(m, q2, t[i+2], C)
  2450  		C, t[i+3] = madd2(m, q3, t[i+3], C)
  2451  		C, t[i+4] = madd2(m, q4, t[i+4], C)
  2452  		C, t[i+5] = madd2(m, q5, t[i+5], C)
  2453  		C, t[i+6] = madd2(m, q6, t[i+6], C)
  2454  		C, t[i+7] = madd2(m, q7, t[i+7], C)
  2455  		C, t[i+8] = madd2(m, q8, t[i+8], C)
  2456  		C, t[i+9] = madd2(m, q9, t[i+9], C)
  2457  		C, t[i+10] = madd2(m, q10, t[i+10], C)
  2458  		C, t[i+11] = madd2(m, q11, t[i+11], C)
  2459  
  2460  		t[i+Limbs] += C
  2461  	}
  2462  	{
  2463  		const i = 9
  2464  		m = t[i] * qInvNeg
  2465  
  2466  		C = madd0(m, q0, t[i+0])
  2467  		C, t[i+1] = madd2(m, q1, t[i+1], C)
  2468  		C, t[i+2] = madd2(m, q2, t[i+2], C)
  2469  		C, t[i+3] = madd2(m, q3, t[i+3], C)
  2470  		C, t[i+4] = madd2(m, q4, t[i+4], C)
  2471  		C, t[i+5] = madd2(m, q5, t[i+5], C)
  2472  		C, t[i+6] = madd2(m, q6, t[i+6], C)
  2473  		C, t[i+7] = madd2(m, q7, t[i+7], C)
  2474  		C, t[i+8] = madd2(m, q8, t[i+8], C)
  2475  		C, t[i+9] = madd2(m, q9, t[i+9], C)
  2476  		C, t[i+10] = madd2(m, q10, t[i+10], C)
  2477  		C, t[i+11] = madd2(m, q11, t[i+11], C)
  2478  
  2479  		t[i+Limbs] += C
  2480  	}
  2481  	{
  2482  		const i = 10
  2483  		m = t[i] * qInvNeg
  2484  
  2485  		C = madd0(m, q0, t[i+0])
  2486  		C, t[i+1] = madd2(m, q1, t[i+1], C)
  2487  		C, t[i+2] = madd2(m, q2, t[i+2], C)
  2488  		C, t[i+3] = madd2(m, q3, t[i+3], C)
  2489  		C, t[i+4] = madd2(m, q4, t[i+4], C)
  2490  		C, t[i+5] = madd2(m, q5, t[i+5], C)
  2491  		C, t[i+6] = madd2(m, q6, t[i+6], C)
  2492  		C, t[i+7] = madd2(m, q7, t[i+7], C)
  2493  		C, t[i+8] = madd2(m, q8, t[i+8], C)
  2494  		C, t[i+9] = madd2(m, q9, t[i+9], C)
  2495  		C, t[i+10] = madd2(m, q10, t[i+10], C)
  2496  		C, t[i+11] = madd2(m, q11, t[i+11], C)
  2497  
  2498  		t[i+Limbs] += C
  2499  	}
  2500  	{
  2501  		const i = 11
  2502  		m := t[i] * qInvNeg
  2503  
  2504  		C = madd0(m, q0, t[i+0])
  2505  		C, z[0] = madd2(m, q1, t[i+1], C)
  2506  		C, z[1] = madd2(m, q2, t[i+2], C)
  2507  		C, z[2] = madd2(m, q3, t[i+3], C)
  2508  		C, z[3] = madd2(m, q4, t[i+4], C)
  2509  		C, z[4] = madd2(m, q5, t[i+5], C)
  2510  		C, z[5] = madd2(m, q6, t[i+6], C)
  2511  		C, z[6] = madd2(m, q7, t[i+7], C)
  2512  		C, z[7] = madd2(m, q8, t[i+8], C)
  2513  		C, z[8] = madd2(m, q9, t[i+9], C)
  2514  		C, z[9] = madd2(m, q10, t[i+10], C)
  2515  		z[11], z[10] = madd2(m, q11, t[i+11], C)
  2516  	}
  2517  
  2518  	// if z ⩾ q → z -= q
  2519  	if !z.smallerThanModulus() {
  2520  		var b uint64
  2521  		z[0], b = bits.Sub64(z[0], q0, 0)
  2522  		z[1], b = bits.Sub64(z[1], q1, b)
  2523  		z[2], b = bits.Sub64(z[2], q2, b)
  2524  		z[3], b = bits.Sub64(z[3], q3, b)
  2525  		z[4], b = bits.Sub64(z[4], q4, b)
  2526  		z[5], b = bits.Sub64(z[5], q5, b)
  2527  		z[6], b = bits.Sub64(z[6], q6, b)
  2528  		z[7], b = bits.Sub64(z[7], q7, b)
  2529  		z[8], b = bits.Sub64(z[8], q8, b)
  2530  		z[9], b = bits.Sub64(z[9], q9, b)
  2531  		z[10], b = bits.Sub64(z[10], q10, b)
  2532  		z[11], _ = bits.Sub64(z[11], q11, b)
  2533  	}
  2534  	// </standard SOS>
  2535  
  2536  	if mustNeg {
  2537  		// We have computed ( 2⁶³ r + X ) r⁻¹ = 2⁶³ + X r⁻¹ instead
  2538  		var b uint64
  2539  		z[0], b = bits.Sub64(z[0], signBitSelector, 0)
  2540  		z[1], b = bits.Sub64(z[1], 0, b)
  2541  		z[2], b = bits.Sub64(z[2], 0, b)
  2542  		z[3], b = bits.Sub64(z[3], 0, b)
  2543  		z[4], b = bits.Sub64(z[4], 0, b)
  2544  		z[5], b = bits.Sub64(z[5], 0, b)
  2545  		z[6], b = bits.Sub64(z[6], 0, b)
  2546  		z[7], b = bits.Sub64(z[7], 0, b)
  2547  		z[8], b = bits.Sub64(z[8], 0, b)
  2548  		z[9], b = bits.Sub64(z[9], 0, b)
  2549  		z[10], b = bits.Sub64(z[10], 0, b)
  2550  		z[11], b = bits.Sub64(z[11], 0, b)
  2551  
  2552  		// Occurs iff x == 0 && xHi < 0, i.e. X = rX' for -2⁶³ ≤ X' < 0
  2553  
  2554  		if b != 0 {
  2555  			// z[11] = -1
  2556  			// negative: add q
  2557  			const neg1 = 0xFFFFFFFFFFFFFFFF
  2558  
  2559  			var carry uint64
  2560  
  2561  			z[0], carry = bits.Add64(z[0], q0, 0)
  2562  			z[1], carry = bits.Add64(z[1], q1, carry)
  2563  			z[2], carry = bits.Add64(z[2], q2, carry)
  2564  			z[3], carry = bits.Add64(z[3], q3, carry)
  2565  			z[4], carry = bits.Add64(z[4], q4, carry)
  2566  			z[5], carry = bits.Add64(z[5], q5, carry)
  2567  			z[6], carry = bits.Add64(z[6], q6, carry)
  2568  			z[7], carry = bits.Add64(z[7], q7, carry)
  2569  			z[8], carry = bits.Add64(z[8], q8, carry)
  2570  			z[9], carry = bits.Add64(z[9], q9, carry)
  2571  			z[10], carry = bits.Add64(z[10], q10, carry)
  2572  			z[11], _ = bits.Add64(neg1, q11, carry)
  2573  		}
  2574  	}
  2575  }
  2576  
  2577  const (
  2578  	updateFactorsConversionBias    int64 = 0x7fffffff7fffffff // (2³¹ - 1)(2³² + 1)
  2579  	updateFactorIdentityMatrixRow0       = 1
  2580  	updateFactorIdentityMatrixRow1       = 1 << 32
  2581  )
  2582  
  2583  func updateFactorsDecompose(c int64) (int64, int64) {
  2584  	c += updateFactorsConversionBias
  2585  	const low32BitsFilter int64 = 0xFFFFFFFF
  2586  	f := c&low32BitsFilter - 0x7FFFFFFF
  2587  	g := c>>32&low32BitsFilter - 0x7FFFFFFF
  2588  	return f, g
  2589  }
  2590  
  2591  // negL negates in place [x | xHi] and return the new most significant word xHi
  2592  func negL(x *Element, xHi uint64) uint64 {
  2593  	var b uint64
  2594  
  2595  	x[0], b = bits.Sub64(0, x[0], 0)
  2596  	x[1], b = bits.Sub64(0, x[1], b)
  2597  	x[2], b = bits.Sub64(0, x[2], b)
  2598  	x[3], b = bits.Sub64(0, x[3], b)
  2599  	x[4], b = bits.Sub64(0, x[4], b)
  2600  	x[5], b = bits.Sub64(0, x[5], b)
  2601  	x[6], b = bits.Sub64(0, x[6], b)
  2602  	x[7], b = bits.Sub64(0, x[7], b)
  2603  	x[8], b = bits.Sub64(0, x[8], b)
  2604  	x[9], b = bits.Sub64(0, x[9], b)
  2605  	x[10], b = bits.Sub64(0, x[10], b)
  2606  	x[11], b = bits.Sub64(0, x[11], b)
  2607  	xHi, _ = bits.Sub64(0, xHi, b)
  2608  
  2609  	return xHi
  2610  }
  2611  
  2612  // mulWNonModular multiplies by one word in non-montgomery, without reducing
  2613  func (z *Element) mulWNonModular(x *Element, y int64) uint64 {
  2614  
  2615  	// w := abs(y)
  2616  	m := y >> 63
  2617  	w := uint64((y ^ m) - m)
  2618  
  2619  	var c uint64
  2620  	c, z[0] = bits.Mul64(x[0], w)
  2621  	c, z[1] = madd1(x[1], w, c)
  2622  	c, z[2] = madd1(x[2], w, c)
  2623  	c, z[3] = madd1(x[3], w, c)
  2624  	c, z[4] = madd1(x[4], w, c)
  2625  	c, z[5] = madd1(x[5], w, c)
  2626  	c, z[6] = madd1(x[6], w, c)
  2627  	c, z[7] = madd1(x[7], w, c)
  2628  	c, z[8] = madd1(x[8], w, c)
  2629  	c, z[9] = madd1(x[9], w, c)
  2630  	c, z[10] = madd1(x[10], w, c)
  2631  	c, z[11] = madd1(x[11], w, c)
  2632  
  2633  	if y < 0 {
  2634  		c = negL(z, c)
  2635  	}
  2636  
  2637  	return c
  2638  }
  2639  
  2640  // linearCombNonModular computes a linear combination without modular reduction
  2641  func (z *Element) linearCombNonModular(x *Element, xC int64, y *Element, yC int64) uint64 {
  2642  	var yTimes Element
  2643  
  2644  	yHi := yTimes.mulWNonModular(y, yC)
  2645  	xHi := z.mulWNonModular(x, xC)
  2646  
  2647  	var carry uint64
  2648  	z[0], carry = bits.Add64(z[0], yTimes[0], 0)
  2649  	z[1], carry = bits.Add64(z[1], yTimes[1], carry)
  2650  	z[2], carry = bits.Add64(z[2], yTimes[2], carry)
  2651  	z[3], carry = bits.Add64(z[3], yTimes[3], carry)
  2652  	z[4], carry = bits.Add64(z[4], yTimes[4], carry)
  2653  	z[5], carry = bits.Add64(z[5], yTimes[5], carry)
  2654  	z[6], carry = bits.Add64(z[6], yTimes[6], carry)
  2655  	z[7], carry = bits.Add64(z[7], yTimes[7], carry)
  2656  	z[8], carry = bits.Add64(z[8], yTimes[8], carry)
  2657  	z[9], carry = bits.Add64(z[9], yTimes[9], carry)
  2658  	z[10], carry = bits.Add64(z[10], yTimes[10], carry)
  2659  	z[11], carry = bits.Add64(z[11], yTimes[11], carry)
  2660  
  2661  	yHi, _ = bits.Add64(xHi, yHi, carry)
  2662  
  2663  	return yHi
  2664  }