github.com/ericlagergren/ctb@v0.0.0-20220810041818-96749d9c394d/xbits/arith.go

github.com/ericlagergren/ctb@v0.0.0-20220810041818-96749d9c394d/xbits/arith.go (about)

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package xbits
     6  
     7  // Many of the loops in this file are of the form
     8  //   for i := 0; i < len(z) && i < len(x) && i < len(y); i++
     9  // i < len(z) is the real condition.
    10  // However, checking i < len(x) && i < len(y) as well is faster than
    11  // having the compiler do a bounds check in the body of the loop;
    12  // remarkably it is even faster than hoisting the bounds check
    13  // out of the loop, by doing something like
    14  //   _, _ = x[len(z)-1], y[len(z)-1]
    15  // There are other ways to hoist the bounds check out of the loop,
    16  // but the compiler's BCE isn't powerful enough for them (yet?).
    17  // See the discussion in CL 164966.
    18  
    19  import "math/bits"
    20  
    21  func mulAddVWW(z, x []uint64, y, r uint64) (c uint64) {
    22  	c = r
    23  	// The comment near the top of this file discusses this for loop condition.
    24  	for i := 0; i < len(z) && i < len(x); i++ {
    25  		c, z[i] = mul128(x[i], y, c)
    26  	}
    27  	return
    28  }
    29  
    30  // The resulting carry c is either 0 or 1.
    31  func addVV(z, x, y []uint64) (c uint64) {
    32  	// The comment near the top of this file discusses this for loop condition.
    33  	for i := 0; i < len(z) && i < len(x) && i < len(y); i++ {
    34  		zi, cc := bits.Add64(x[i], y[i], c)
    35  		z[i] = zi
    36  		c = cc
    37  	}
    38  	return
    39  }
    40  
    41  // The resulting carry c is either 0 or 1.
    42  func subVV(z, x, y []uint64) (c uint64) {
    43  	// The comment near the top of this file discusses this for loop condition.
    44  	for i := 0; i < len(z) && i < len(x) && i < len(y); i++ {
    45  		zi, cc := bits.Sub(uint(x[i]), uint(y[i]), uint(c))
    46  		z[i] = uint64(zi)
    47  		c = uint64(cc)
    48  	}
    49  	return
    50  }
    51  
    52  // q = ( x1 << _W + x0 - r)/y. m = floor(( _B^2 - 1 ) / d - _B). Requiring x1<y.
    53  // An approximate reciprocal with a reference to "Improved Division by Invariant Integers
    54  // (IEEE Transactions on Computers, 11 Jun. 2010)"
    55  func divWW(x1, x0, y, m uint64) (q, r uint64) {
    56  	s := bits.LeadingZeros64(y)
    57  	if s != 0 {
    58  		x1 = x1<<s | x0>>(64-s)
    59  		x0 <<= s
    60  		y <<= s
    61  	}
    62  	d := uint(y)
    63  	// We know that
    64  	//   m = ⎣(B^2-1)/d⎦-B
    65  	//   ⎣(B^2-1)/d⎦ = m+B
    66  	//   (B^2-1)/d = m+B+delta1    0 <= delta1 <= (d-1)/d
    67  	//   B^2/d = m+B+delta2        0 <= delta2 <= 1
    68  	// The quotient we're trying to compute is
    69  	//   quotient = ⎣(x1*B+x0)/d⎦
    70  	//            = ⎣(x1*B*(B^2/d)+x0*(B^2/d))/B^2⎦
    71  	//            = ⎣(x1*B*(m+B+delta2)+x0*(m+B+delta2))/B^2⎦
    72  	//            = ⎣(x1*m+x1*B+x0)/B + x0*m/B^2 + delta2*(x1*B+x0)/B^2⎦
    73  	// The latter two terms of this three-term sum are between 0 and 1.
    74  	// So we can compute just the first term, and we will be low by at most 2.
    75  	t1, t0 := bits.Mul(uint(m), uint(x1))
    76  	_, c := bits.Add(t0, uint(x0), 0)
    77  	t1, _ = bits.Add(t1, uint(x1), c)
    78  	// The quotient is either t1, t1+1, or t1+2.
    79  	// We'll try t1 and adjust if needed.
    80  	qq := t1
    81  	// compute remainder r=x-d*q.
    82  	dq1, dq0 := bits.Mul(d, qq)
    83  	r0, b := bits.Sub(uint(x0), dq0, 0)
    84  	r1, _ := bits.Sub(uint(x1), dq1, b)
    85  	// The remainder we just computed is bounded above by B+d:
    86  	// r = x1*B + x0 - d*q.
    87  	//   = x1*B + x0 - d*⎣(x1*m+x1*B+x0)/B⎦
    88  	//   = x1*B + x0 - d*((x1*m+x1*B+x0)/B-alpha)                                   0 <= alpha < 1
    89  	//   = x1*B + x0 - x1*d/B*m                         - x1*d - x0*d/B + d*alpha
    90  	//   = x1*B + x0 - x1*d/B*⎣(B^2-1)/d-B⎦             - x1*d - x0*d/B + d*alpha
    91  	//   = x1*B + x0 - x1*d/B*⎣(B^2-1)/d-B⎦             - x1*d - x0*d/B + d*alpha
    92  	//   = x1*B + x0 - x1*d/B*((B^2-1)/d-B-beta)        - x1*d - x0*d/B + d*alpha   0 <= beta < 1
    93  	//   = x1*B + x0 - x1*B + x1/B + x1*d + x1*d/B*beta - x1*d - x0*d/B + d*alpha
    94  	//   =        x0        + x1/B        + x1*d/B*beta        - x0*d/B + d*alpha
    95  	//   = x0*(1-d/B) + x1*(1+d*beta)/B + d*alpha
    96  	//   <  B*(1-d/B) +  d*B/B          + d          because x0<B (and 1-d/B>0), x1<d, 1+d*beta<=B, alpha<1
    97  	//   =  B - d     +  d              + d
    98  	//   = B+d
    99  	// So r1 can only be 0 or 1. If r1 is 1, then we know q was too small.
   100  	// Add 1 to q and subtract d from r. That guarantees that r is <B, so
   101  	// we no longer need to keep track of r1.
   102  	if r1 != 0 {
   103  		qq++
   104  		r0 -= d
   105  	}
   106  	// If the remainder is still too large, increment q one more time.
   107  	if r0 >= d {
   108  		qq++
   109  		r0 -= d
   110  	}
   111  	return uint64(qq), uint64(r0 >> s)
   112  }
   113  
   114  // greaterThan reports whether (x1<<_W + x2) > (y1<<_W + y2)
   115  func greaterThan(x1, x2, y1, y2 uint64) bool {
   116  	return x1 > y1 || x1 == y1 && x2 > y2
   117  }
   118  
   119  func reciprocal(d1 uint64) uint64 {
   120  	const mask = 1<<64 - 1
   121  	u := d1 << bits.LeadingZeros64(d1)
   122  	// (_B^2-1)/U-_B = (_B*(_M-C)+_M)/U
   123  	rec, _ := bits.Div64(^u, mask, u)
   124  	return rec
   125  }