github.com/rsc/go@v0.0.0-20150416155037-e040fd465409/src/cmd/internal/gc/big/float.go

github.com/rsc/go@v0.0.0-20150416155037-e040fd465409/src/cmd/internal/gc/big/float.go (about)

     1  // Copyright 2014 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // This file implements multi-precision floating-point numbers.
     6  // Like in the GNU MPFR library (http://www.mpfr.org/), operands
     7  // can be of mixed precision. Unlike MPFR, the rounding mode is
     8  // not specified with each operation, but with each operand. The
     9  // rounding mode of the result operand determines the rounding
    10  // mode of an operation. This is a from-scratch implementation.
    11  
    12  package big
    13  
    14  import (
    15  	"fmt"
    16  	"math"
    17  )
    18  
    19  const debugFloat = true // enable for debugging
    20  
    21  // A nonzero finite Float represents a multi-precision floating point number
    22  //
    23  //   sign × mantissa × 2**exponent
    24  //
    25  // with 0.5 <= mantissa < 1.0, and MinExp <= exponent <= MaxExp.
    26  // A Float may also be zero (+0, -0) or infinite (+Inf, -Inf).
    27  // All Floats are ordered, and the ordering of two Floats x and y
    28  // is defined by x.Cmp(y).
    29  //
    30  // Each Float value also has a precision, rounding mode, and accuracy.
    31  // The precision is the maximum number of mantissa bits available to
    32  // represent the value. The rounding mode specifies how a result should
    33  // be rounded to fit into the mantissa bits, and accuracy describes the
    34  // rounding error with respect to the exact result.
    35  //
    36  // Unless specified otherwise, all operations (including setters) that
    37  // specify a *Float variable for the result (usually via the receiver
    38  // with the exception of MantExp), round the numeric result according
    39  // to the precision and rounding mode of the result variable.
    40  //
    41  // If the provided result precision is 0 (see below), it is set to the
    42  // precision of the argument with the largest precision value before any
    43  // rounding takes place, and the rounding mode remains unchanged. Thus,
    44  // uninitialized Floats provided as result arguments will have their
    45  // precision set to a reasonable value determined by the operands and
    46  // their mode is the zero value for RoundingMode (ToNearestEven).
    47  //
    48  // By setting the desired precision to 24 or 53 and using matching rounding
    49  // mode (typically ToNearestEven), Float operations produce the same results
    50  // as the corresponding float32 or float64 IEEE-754 arithmetic for operands
    51  // that correspond to normal (i.e., not denormal) float32 or float64 numbers.
    52  // Exponent underflow and overflow lead to a 0 or an Infinity for different
    53  // values than IEEE-754 because Float exponents have a much larger range.
    54  //
    55  // The zero (uninitialized) value for a Float is ready to use and represents
    56  // the number +0.0 exactly, with precision 0 and rounding mode ToNearestEven.
    57  //
    58  type Float struct {
    59  	prec uint32
    60  	mode RoundingMode
    61  	acc  Accuracy
    62  	form form
    63  	neg  bool
    64  	mant nat
    65  	exp  int32
    66  }
    67  
    68  // Float operations that would lead to a NaN under IEEE-754 rules cause
    69  // a run-time panic of ErrNaN type.
    70  type ErrNaN struct {
    71  	msg string
    72  }
    73  
    74  // NewFloat allocates and returns a new Float set to x,
    75  // with precision 53 and rounding mode ToNearestEven.
    76  // NewFloat panics with ErrNaN if x is a NaN.
    77  func NewFloat(x float64) *Float {
    78  	if math.IsNaN(x) {
    79  		panic(ErrNaN{"NewFloat(NaN)"})
    80  	}
    81  	return new(Float).SetFloat64(x)
    82  }
    83  
    84  // Exponent and precision limits.
    85  const (
    86  	MaxExp  = math.MaxInt32  // largest supported exponent
    87  	MinExp  = math.MinInt32  // smallest supported exponent
    88  	MaxPrec = math.MaxUint32 // largest (theoretically) supported precision; likely memory-limited
    89  )
    90  
    91  // Internal representation: The mantissa bits x.mant of a nonzero finite
    92  // Float x are stored in a nat slice long enough to hold up to x.prec bits;
    93  // the slice may (but doesn't have to) be shorter if the mantissa contains
    94  // trailing 0 bits. x.mant is normalized if the msb of x.mant == 1 (i.e.,
    95  // the msb is shifted all the way "to the left"). Thus, if the mantissa has
    96  // trailing 0 bits or x.prec is not a multiple of the the Word size _W,
    97  // x.mant[0] has trailing zero bits. The msb of the mantissa corresponds
    98  // to the value 0.5; the exponent x.exp shifts the binary point as needed.
    99  //
   100  // A zero or non-finite Float x ignores x.mant and x.exp.
   101  //
   102  // x                 form      neg      mant         exp
   103  // ----------------------------------------------------------
   104  // ±0                zero      sign     -            -
   105  // 0 < |x| < +Inf    finite    sign     mantissa     exponent
   106  // ±Inf              inf       sign     -            -
   107  
   108  // A form value describes the internal representation.
   109  type form byte
   110  
   111  // The form value order is relevant - do not change!
   112  const (
   113  	zero form = iota
   114  	finite
   115  	inf
   116  )
   117  
   118  // RoundingMode determines how a Float value is rounded to the
   119  // desired precision. Rounding may change the Float value; the
   120  // rounding error is described by the Float's Accuracy.
   121  type RoundingMode byte
   122  
   123  // The following rounding modes are supported.
   124  const (
   125  	ToNearestEven RoundingMode = iota // == IEEE 754-2008 roundTiesToEven
   126  	ToNearestAway                     // == IEEE 754-2008 roundTiesToAway
   127  	ToZero                            // == IEEE 754-2008 roundTowardZero
   128  	AwayFromZero                      // no IEEE 754-2008 equivalent
   129  	ToNegativeInf                     // == IEEE 754-2008 roundTowardNegative
   130  	ToPositiveInf                     // == IEEE 754-2008 roundTowardPositive
   131  )
   132  
   133  //go:generate stringer -type=RoundingMode
   134  
   135  // Accuracy describes the rounding error produced by the most recent
   136  // operation that generated a Float value, relative to the exact value.
   137  type Accuracy int8
   138  
   139  // Constants describing the Accuracy of a Float.
   140  const (
   141  	Below Accuracy = -1
   142  	Exact Accuracy = 0
   143  	Above Accuracy = +1
   144  )
   145  
   146  //go:generate stringer -type=Accuracy
   147  
   148  // SetPrec sets z's precision to prec and returns the (possibly) rounded
   149  // value of z. Rounding occurs according to z's rounding mode if the mantissa
   150  // cannot be represented in prec bits without loss of precision.
   151  // SetPrec(0) maps all finite values to ±0; infinite values remain unchanged.
   152  // If prec > MaxPrec, it is set to MaxPrec.
   153  func (z *Float) SetPrec(prec uint) *Float {
   154  	z.acc = Exact // optimistically assume no rounding is needed
   155  
   156  	// special case
   157  	if prec == 0 {
   158  		z.prec = 0
   159  		if z.form == finite {
   160  			// truncate z to 0
   161  			z.acc = makeAcc(z.neg)
   162  			z.form = zero
   163  		}
   164  		return z
   165  	}
   166  
   167  	// general case
   168  	if prec > MaxPrec {
   169  		prec = MaxPrec
   170  	}
   171  	old := z.prec
   172  	z.prec = uint32(prec)
   173  	if z.prec < old {
   174  		z.round(0)
   175  	}
   176  	return z
   177  }
   178  
   179  func makeAcc(above bool) Accuracy {
   180  	if above {
   181  		return Above
   182  	}
   183  	return Below
   184  }
   185  
   186  // SetMode sets z's rounding mode to mode and returns an exact z.
   187  // z remains unchanged otherwise.
   188  // z.SetMode(z.Mode()) is a cheap way to set z's accuracy to Exact.
   189  func (z *Float) SetMode(mode RoundingMode) *Float {
   190  	z.mode = mode
   191  	z.acc = Exact
   192  	return z
   193  }
   194  
   195  // Prec returns the mantissa precision of x in bits.
   196  // The result may be 0 for |x| == 0 and |x| == Inf.
   197  func (x *Float) Prec() uint {
   198  	return uint(x.prec)
   199  }
   200  
   201  // MinPrec returns the minimum precision required to represent x exactly
   202  // (i.e., the smallest prec before x.SetPrec(prec) would start rounding x).
   203  // The result is 0 for |x| == 0 and |x| == Inf.
   204  func (x *Float) MinPrec() uint {
   205  	if x.form != finite {
   206  		return 0
   207  	}
   208  	return uint(len(x.mant))*_W - x.mant.trailingZeroBits()
   209  }
   210  
   211  // Mode returns the rounding mode of x.
   212  func (x *Float) Mode() RoundingMode {
   213  	return x.mode
   214  }
   215  
   216  // Acc returns the accuracy of x produced by the most recent operation.
   217  func (x *Float) Acc() Accuracy {
   218  	return x.acc
   219  }
   220  
   221  // Sign returns:
   222  //
   223  //	-1 if x <   0
   224  //	 0 if x is ±0
   225  //	+1 if x >   0
   226  //
   227  func (x *Float) Sign() int {
   228  	if debugFloat {
   229  		x.validate()
   230  	}
   231  	if x.form == zero {
   232  		return 0
   233  	}
   234  	if x.neg {
   235  		return -1
   236  	}
   237  	return 1
   238  }
   239  
   240  // MantExp breaks x into its mantissa and exponent components
   241  // and returns the exponent. If a non-nil mant argument is
   242  // provided its value is set to the mantissa of x, with the
   243  // same precision and rounding mode as x. The components
   244  // satisfy x == mant × 2**exp, with 0.5 <= |mant| < 1.0.
   245  // Calling MantExp with a nil argument is an efficient way to
   246  // get the exponent of the receiver.
   247  //
   248  // Special cases are:
   249  //
   250  //	(  ±0).MantExp(mant) = 0, with mant set to   ±0
   251  //	(±Inf).MantExp(mant) = 0, with mant set to ±Inf
   252  //
   253  // x and mant may be the same in which case x is set to its
   254  // mantissa value.
   255  func (x *Float) MantExp(mant *Float) (exp int) {
   256  	if debugFloat {
   257  		x.validate()
   258  	}
   259  	if x.form == finite {
   260  		exp = int(x.exp)
   261  	}
   262  	if mant != nil {
   263  		mant.Copy(x)
   264  		if mant.form == finite {
   265  			mant.exp = 0
   266  		}
   267  	}
   268  	return
   269  }
   270  
   271  func (z *Float) setExpAndRound(exp int64, sbit uint) {
   272  	if exp < MinExp {
   273  		// underflow
   274  		z.acc = makeAcc(z.neg)
   275  		z.form = zero
   276  		return
   277  	}
   278  
   279  	if exp > MaxExp {
   280  		// overflow
   281  		z.acc = makeAcc(!z.neg)
   282  		z.form = inf
   283  		return
   284  	}
   285  
   286  	z.form = finite
   287  	z.exp = int32(exp)
   288  	z.round(sbit)
   289  }
   290  
   291  // SetMantExp sets z to mant × 2**exp and and returns z.
   292  // The result z has the same precision and rounding mode
   293  // as mant. SetMantExp is an inverse of MantExp but does
   294  // not require 0.5 <= |mant| < 1.0. Specifically:
   295  //
   296  //	mant := new(Float)
   297  //	new(Float).SetMantExp(mant, x.SetMantExp(mant)).Cmp(x).Eql() is true
   298  //
   299  // Special cases are:
   300  //
   301  //	z.SetMantExp(  ±0, exp) =   ±0
   302  //	z.SetMantExp(±Inf, exp) = ±Inf
   303  //
   304  // z and mant may be the same in which case z's exponent
   305  // is set to exp.
   306  func (z *Float) SetMantExp(mant *Float, exp int) *Float {
   307  	if debugFloat {
   308  		z.validate()
   309  		mant.validate()
   310  	}
   311  	z.Copy(mant)
   312  	if z.form != finite {
   313  		return z
   314  	}
   315  	z.setExpAndRound(int64(z.exp)+int64(exp), 0)
   316  	return z
   317  }
   318  
   319  // Signbit returns true if x is negative or negative zero.
   320  func (x *Float) Signbit() bool {
   321  	return x.neg
   322  }
   323  
   324  // IsInf reports whether x is +Inf or -Inf.
   325  func (x *Float) IsInf() bool {
   326  	return x.form == inf
   327  }
   328  
   329  // IsInt reports whether x is an integer.
   330  // ±Inf values are not integers.
   331  func (x *Float) IsInt() bool {
   332  	if debugFloat {
   333  		x.validate()
   334  	}
   335  	// special cases
   336  	if x.form != finite {
   337  		return x.form == zero
   338  	}
   339  	// x.form == finite
   340  	if x.exp <= 0 {
   341  		return false
   342  	}
   343  	// x.exp > 0
   344  	return x.prec <= uint32(x.exp) || x.MinPrec() <= uint(x.exp) // not enough bits for fractional mantissa
   345  }
   346  
   347  // debugging support
   348  func (x *Float) validate() {
   349  	if !debugFloat {
   350  		// avoid performance bugs
   351  		panic("validate called but debugFloat is not set")
   352  	}
   353  	if x.form != finite {
   354  		return
   355  	}
   356  	m := len(x.mant)
   357  	if m == 0 {
   358  		panic("nonzero finite number with empty mantissa")
   359  	}
   360  	const msb = 1 << (_W - 1)
   361  	if x.mant[m-1]&msb == 0 {
   362  		panic(fmt.Sprintf("msb not set in last word %#x of %s", x.mant[m-1], x.Format('p', 0)))
   363  	}
   364  	if x.prec == 0 {
   365  		panic("zero precision finite number")
   366  	}
   367  }
   368  
   369  // round rounds z according to z.mode to z.prec bits and sets z.acc accordingly.
   370  // sbit must be 0 or 1 and summarizes any "sticky bit" information one might
   371  // have before calling round. z's mantissa must be normalized (with the msb set)
   372  // or empty.
   373  //
   374  // CAUTION: The rounding modes ToNegativeInf, ToPositiveInf are affected by the
   375  // sign of z. For correct rounding, the sign of z must be set correctly before
   376  // calling round.
   377  func (z *Float) round(sbit uint) {
   378  	if debugFloat {
   379  		z.validate()
   380  		if z.form > finite {
   381  			panic(fmt.Sprintf("round called for non-finite value %s", z))
   382  		}
   383  	}
   384  	// z.form <= finite
   385  
   386  	z.acc = Exact
   387  	if z.form == zero {
   388  		return
   389  	}
   390  	// z.form == finite && len(z.mant) > 0
   391  	// m > 0 implies z.prec > 0 (checked by validate)
   392  
   393  	m := uint32(len(z.mant)) // present mantissa length in words
   394  	bits := m * _W           // present mantissa bits
   395  	if bits <= z.prec {
   396  		// mantissa fits => nothing to do
   397  		return
   398  	}
   399  	// bits > z.prec
   400  
   401  	n := (z.prec + (_W - 1)) / _W // mantissa length in words for desired precision
   402  
   403  	// Rounding is based on two bits: the rounding bit (rbit) and the
   404  	// sticky bit (sbit). The rbit is the bit immediately before the
   405  	// z.prec leading mantissa bits (the "0.5"). The sbit is set if any
   406  	// of the bits before the rbit are set (the "0.25", "0.125", etc.):
   407  	//
   408  	//   rbit  sbit  => "fractional part"
   409  	//
   410  	//   0     0        == 0
   411  	//   0     1        >  0  , < 0.5
   412  	//   1     0        == 0.5
   413  	//   1     1        >  0.5, < 1.0
   414  
   415  	// bits > z.prec: mantissa too large => round
   416  	r := uint(bits - z.prec - 1) // rounding bit position; r >= 0
   417  	rbit := z.mant.bit(r)        // rounding bit
   418  	if sbit == 0 {
   419  		sbit = z.mant.sticky(r)
   420  	}
   421  	if debugFloat && sbit&^1 != 0 {
   422  		panic(fmt.Sprintf("invalid sbit %#x", sbit))
   423  	}
   424  
   425  	// convert ToXInf rounding modes
   426  	mode := z.mode
   427  	switch mode {
   428  	case ToNegativeInf:
   429  		mode = ToZero
   430  		if z.neg {
   431  			mode = AwayFromZero
   432  		}
   433  	case ToPositiveInf:
   434  		mode = AwayFromZero
   435  		if z.neg {
   436  			mode = ToZero
   437  		}
   438  	}
   439  
   440  	// cut off extra words
   441  	if m > n {
   442  		copy(z.mant, z.mant[m-n:]) // move n last words to front
   443  		z.mant = z.mant[:n]
   444  	}
   445  
   446  	// determine number of trailing zero bits t
   447  	t := n*_W - z.prec // 0 <= t < _W
   448  	lsb := Word(1) << t
   449  
   450  	// make rounding decision
   451  	// TODO(gri) This can be simplified (see Bits.round in bits_test.go).
   452  	switch mode {
   453  	case ToZero:
   454  		// nothing to do
   455  	case ToNearestEven, ToNearestAway:
   456  		if rbit == 0 {
   457  			// rounding bits == 0b0x
   458  			mode = ToZero
   459  		} else if sbit == 1 {
   460  			// rounding bits == 0b11
   461  			mode = AwayFromZero
   462  		}
   463  	case AwayFromZero:
   464  		if rbit|sbit == 0 {
   465  			mode = ToZero
   466  		}
   467  	default:
   468  		// ToXInf modes have been converted to ToZero or AwayFromZero
   469  		panic("unreachable")
   470  	}
   471  
   472  	// round and determine accuracy
   473  	switch mode {
   474  	case ToZero:
   475  		if rbit|sbit != 0 {
   476  			z.acc = Below
   477  		}
   478  
   479  	case ToNearestEven, ToNearestAway:
   480  		if debugFloat && rbit != 1 {
   481  			panic("internal error in rounding")
   482  		}
   483  		if mode == ToNearestEven && sbit == 0 && z.mant[0]&lsb == 0 {
   484  			z.acc = Below
   485  			break
   486  		}
   487  		// mode == ToNearestAway || sbit == 1 || z.mant[0]&lsb != 0
   488  		fallthrough
   489  
   490  	case AwayFromZero:
   491  		// add 1 to mantissa
   492  		if addVW(z.mant, z.mant, lsb) != 0 {
   493  			// overflow => shift mantissa right by 1 and add msb
   494  			shrVU(z.mant, z.mant, 1)
   495  			z.mant[n-1] |= 1 << (_W - 1)
   496  			// adjust exponent
   497  			if z.exp < MaxExp {
   498  				z.exp++
   499  			} else {
   500  				// exponent overflow
   501  				z.acc = makeAcc(!z.neg)
   502  				z.form = inf
   503  				return
   504  			}
   505  		}
   506  		z.acc = Above
   507  	}
   508  
   509  	// zero out trailing bits in least-significant word
   510  	z.mant[0] &^= lsb - 1
   511  
   512  	// update accuracy
   513  	if z.acc != Exact && z.neg {
   514  		z.acc = -z.acc
   515  	}
   516  
   517  	if debugFloat {
   518  		z.validate()
   519  	}
   520  
   521  	return
   522  }
   523  
   524  // nlz returns the number of leading zero bits in x.
   525  func nlz(x Word) uint {
   526  	return _W - uint(bitLen(x))
   527  }
   528  
   529  func nlz64(x uint64) uint {
   530  	// TODO(gri) this can be done more nicely
   531  	if _W == 32 {
   532  		if x>>32 == 0 {
   533  			return 32 + nlz(Word(x))
   534  		}
   535  		return nlz(Word(x >> 32))
   536  	}
   537  	if _W == 64 {
   538  		return nlz(Word(x))
   539  	}
   540  	panic("unreachable")
   541  }
   542  
   543  func (z *Float) setBits64(neg bool, x uint64) *Float {
   544  	if z.prec == 0 {
   545  		z.prec = 64
   546  	}
   547  	z.acc = Exact
   548  	z.neg = neg
   549  	if x == 0 {
   550  		z.form = zero
   551  		return z
   552  	}
   553  	// x != 0
   554  	z.form = finite
   555  	s := nlz64(x)
   556  	z.mant = z.mant.setUint64(x << s)
   557  	z.exp = int32(64 - s) // always fits
   558  	if z.prec < 64 {
   559  		z.round(0)
   560  	}
   561  	return z
   562  }
   563  
   564  // SetUint64 sets z to the (possibly rounded) value of x and returns z.
   565  // If z's precision is 0, it is changed to 64 (and rounding will have
   566  // no effect).
   567  func (z *Float) SetUint64(x uint64) *Float {
   568  	return z.setBits64(false, x)
   569  }
   570  
   571  // SetInt64 sets z to the (possibly rounded) value of x and returns z.
   572  // If z's precision is 0, it is changed to 64 (and rounding will have
   573  // no effect).
   574  func (z *Float) SetInt64(x int64) *Float {
   575  	u := x
   576  	if u < 0 {
   577  		u = -u
   578  	}
   579  	// We cannot simply call z.SetUint64(uint64(u)) and change
   580  	// the sign afterwards because the sign affects rounding.
   581  	return z.setBits64(x < 0, uint64(u))
   582  }
   583  
   584  // SetFloat64 sets z to the (possibly rounded) value of x and returns z.
   585  // If z's precision is 0, it is changed to 53 (and rounding will have
   586  // no effect). SetFloat64 panics with ErrNaN if x is a NaN.
   587  func (z *Float) SetFloat64(x float64) *Float {
   588  	if z.prec == 0 {
   589  		z.prec = 53
   590  	}
   591  	if math.IsNaN(x) {
   592  		panic(ErrNaN{"Float.SetFloat64(NaN)"})
   593  	}
   594  	z.acc = Exact
   595  	z.neg = math.Signbit(x) // handle -0, -Inf correctly
   596  	if x == 0 {
   597  		z.form = zero
   598  		return z
   599  	}
   600  	if math.IsInf(x, 0) {
   601  		z.form = inf
   602  		return z
   603  	}
   604  	// normalized x != 0
   605  	z.form = finite
   606  	fmant, exp := math.Frexp(x) // get normalized mantissa
   607  	z.mant = z.mant.setUint64(1<<63 | math.Float64bits(fmant)<<11)
   608  	z.exp = int32(exp) // always fits
   609  	if z.prec < 53 {
   610  		z.round(0)
   611  	}
   612  	return z
   613  }
   614  
   615  // fnorm normalizes mantissa m by shifting it to the left
   616  // such that the msb of the most-significant word (msw) is 1.
   617  // It returns the shift amount. It assumes that len(m) != 0.
   618  func fnorm(m nat) int64 {
   619  	if debugFloat && (len(m) == 0 || m[len(m)-1] == 0) {
   620  		panic("msw of mantissa is 0")
   621  	}
   622  	s := nlz(m[len(m)-1])
   623  	if s > 0 {
   624  		c := shlVU(m, m, s)
   625  		if debugFloat && c != 0 {
   626  			panic("nlz or shlVU incorrect")
   627  		}
   628  	}
   629  	return int64(s)
   630  }
   631  
   632  // SetInt sets z to the (possibly rounded) value of x and returns z.
   633  // If z's precision is 0, it is changed to the larger of x.BitLen()
   634  // or 64 (and rounding will have no effect).
   635  func (z *Float) SetInt(x *Int) *Float {
   636  	// TODO(gri) can be more efficient if z.prec > 0
   637  	// but small compared to the size of x, or if there
   638  	// are many trailing 0's.
   639  	bits := uint32(x.BitLen())
   640  	if z.prec == 0 {
   641  		z.prec = umax32(bits, 64)
   642  	}
   643  	z.acc = Exact
   644  	z.neg = x.neg
   645  	if len(x.abs) == 0 {
   646  		z.form = zero
   647  		return z
   648  	}
   649  	// x != 0
   650  	z.mant = z.mant.set(x.abs)
   651  	fnorm(z.mant)
   652  	z.setExpAndRound(int64(bits), 0)
   653  	return z
   654  }
   655  
   656  // SetRat sets z to the (possibly rounded) value of x and returns z.
   657  // If z's precision is 0, it is changed to the largest of a.BitLen(),
   658  // b.BitLen(), or 64; with x = a/b.
   659  func (z *Float) SetRat(x *Rat) *Float {
   660  	if x.IsInt() {
   661  		return z.SetInt(x.Num())
   662  	}
   663  	var a, b Float
   664  	a.SetInt(x.Num())
   665  	b.SetInt(x.Denom())
   666  	if z.prec == 0 {
   667  		z.prec = umax32(a.prec, b.prec)
   668  	}
   669  	return z.Quo(&a, &b)
   670  }
   671  
   672  // SetInf sets z to the infinite Float -Inf if signbit is
   673  // set, or +Inf if signbit is not set, and returns z. The
   674  // precision of z is unchanged and the result is always
   675  // Exact.
   676  func (z *Float) SetInf(signbit bool) *Float {
   677  	z.acc = Exact
   678  	z.form = inf
   679  	z.neg = signbit
   680  	return z
   681  }
   682  
   683  // Set sets z to the (possibly rounded) value of x and returns z.
   684  // If z's precision is 0, it is changed to the precision of x
   685  // before setting z (and rounding will have no effect).
   686  // Rounding is performed according to z's precision and rounding
   687  // mode; and z's accuracy reports the result error relative to the
   688  // exact (not rounded) result.
   689  func (z *Float) Set(x *Float) *Float {
   690  	if debugFloat {
   691  		x.validate()
   692  	}
   693  	z.acc = Exact
   694  	if z != x {
   695  		z.form = x.form
   696  		z.neg = x.neg
   697  		if x.form == finite {
   698  			z.exp = x.exp
   699  			z.mant = z.mant.set(x.mant)
   700  		}
   701  		if z.prec == 0 {
   702  			z.prec = x.prec
   703  		} else if z.prec < x.prec {
   704  			z.round(0)
   705  		}
   706  	}
   707  	return z
   708  }
   709  
   710  // Copy sets z to x, with the same precision, rounding mode, and
   711  // accuracy as x, and returns z. x is not changed even if z and
   712  // x are the same.
   713  func (z *Float) Copy(x *Float) *Float {
   714  	if debugFloat {
   715  		x.validate()
   716  	}
   717  	if z != x {
   718  		z.prec = x.prec
   719  		z.mode = x.mode
   720  		z.acc = x.acc
   721  		z.form = x.form
   722  		z.neg = x.neg
   723  		if z.form == finite {
   724  			z.mant = z.mant.set(x.mant)
   725  			z.exp = x.exp
   726  		}
   727  	}
   728  	return z
   729  }
   730  
   731  func high32(x nat) uint32 {
   732  	// TODO(gri) This can be done more efficiently on 32bit platforms.
   733  	return uint32(high64(x) >> 32)
   734  }
   735  
   736  func high64(x nat) uint64 {
   737  	i := len(x)
   738  	if i == 0 {
   739  		return 0
   740  	}
   741  	// i > 0
   742  	v := uint64(x[i-1])
   743  	if _W == 32 {
   744  		v <<= 32
   745  		if i > 1 {
   746  			v |= uint64(x[i-2])
   747  		}
   748  	}
   749  	return v
   750  }
   751  
   752  // Uint64 returns the unsigned integer resulting from truncating x
   753  // towards zero. If 0 <= x <= math.MaxUint64, the result is Exact
   754  // if x is an integer and Below otherwise.
   755  // The result is (0, Above) for x < 0, and (math.MaxUint64, Below)
   756  // for x > math.MaxUint64.
   757  func (x *Float) Uint64() (uint64, Accuracy) {
   758  	if debugFloat {
   759  		x.validate()
   760  	}
   761  
   762  	switch x.form {
   763  	case finite:
   764  		if x.neg {
   765  			return 0, Above
   766  		}
   767  		// 0 < x < +Inf
   768  		if x.exp <= 0 {
   769  			// 0 < x < 1
   770  			return 0, Below
   771  		}
   772  		// 1 <= x < Inf
   773  		if x.exp <= 64 {
   774  			// u = trunc(x) fits into a uint64
   775  			u := high64(x.mant) >> (64 - uint32(x.exp))
   776  			if x.MinPrec() <= 64 {
   777  				return u, Exact
   778  			}
   779  			return u, Below // x truncated
   780  		}
   781  		// x too large
   782  		return math.MaxUint64, Below
   783  
   784  	case zero:
   785  		return 0, Exact
   786  
   787  	case inf:
   788  		if x.neg {
   789  			return 0, Above
   790  		}
   791  		return math.MaxUint64, Below
   792  	}
   793  
   794  	panic("unreachable")
   795  }
   796  
   797  // Int64 returns the integer resulting from truncating x towards zero.
   798  // If math.MinInt64 <= x <= math.MaxInt64, the result is Exact if x is
   799  // an integer, and Above (x < 0) or Below (x > 0) otherwise.
   800  // The result is (math.MinInt64, Above) for x < math.MinInt64,
   801  // and (math.MaxInt64, Below) for x > math.MaxInt64.
   802  func (x *Float) Int64() (int64, Accuracy) {
   803  	if debugFloat {
   804  		x.validate()
   805  	}
   806  
   807  	switch x.form {
   808  	case finite:
   809  		// 0 < |x| < +Inf
   810  		acc := makeAcc(x.neg)
   811  		if x.exp <= 0 {
   812  			// 0 < |x| < 1
   813  			return 0, acc
   814  		}
   815  		// x.exp > 0
   816  
   817  		// 1 <= |x| < +Inf
   818  		if x.exp <= 63 {
   819  			// i = trunc(x) fits into an int64 (excluding math.MinInt64)
   820  			i := int64(high64(x.mant) >> (64 - uint32(x.exp)))
   821  			if x.neg {
   822  				i = -i
   823  			}
   824  			if x.MinPrec() <= uint(x.exp) {
   825  				return i, Exact
   826  			}
   827  			return i, acc // x truncated
   828  		}
   829  		if x.neg {
   830  			// check for special case x == math.MinInt64 (i.e., x == -(0.5 << 64))
   831  			if x.exp == 64 && x.MinPrec() == 1 {
   832  				acc = Exact
   833  			}
   834  			return math.MinInt64, acc
   835  		}
   836  		// x too large
   837  		return math.MaxInt64, Below
   838  
   839  	case zero:
   840  		return 0, Exact
   841  
   842  	case inf:
   843  		if x.neg {
   844  			return math.MinInt64, Above
   845  		}
   846  		return math.MaxInt64, Below
   847  	}
   848  
   849  	panic("unreachable")
   850  }
   851  
   852  // TODO(gri) Float32 and Float64 are very similar internally but for the
   853  // floatxx parameters and some conversions. Should factor out shared code.
   854  
   855  // Float32 returns the float32 value nearest to x. If x is too small to be
   856  // represented by a float32 (|x| < math.SmallestNonzeroFloat32), the result
   857  // is (0, Below) or (-0, Above), respectively, depending on the sign of x.
   858  // If x is too large to be represented by a float32 (|x| > math.MaxFloat32),
   859  // the result is (+Inf, Above) or (-Inf, Below), depending on the sign of x.
   860  func (x *Float) Float32() (float32, Accuracy) {
   861  	if debugFloat {
   862  		x.validate()
   863  	}
   864  
   865  	switch x.form {
   866  	case finite:
   867  		// 0 < |x| < +Inf
   868  
   869  		const (
   870  			fbits = 32                //        float size
   871  			mbits = 23                //        mantissa size (excluding implicit msb)
   872  			ebits = fbits - mbits - 1 //     8  exponent size
   873  			bias  = 1<<(ebits-1) - 1  //   127  exponent bias
   874  			dmin  = 1 - bias - mbits  //  -149  smallest unbiased exponent (denormal)
   875  			emin  = 1 - bias          //  -126  smallest unbiased exponent (normal)
   876  			emax  = bias              //   127  largest unbiased exponent (normal)
   877  		)
   878  
   879  		// Float mantissae m have an explicit msb and are in the range 0.5 <= m < 1.0.
   880  		// floatxx mantissae have an implicit msb and are in the range 1.0 <= m < 2.0.
   881  		// For a given mantissa m, we need to add 1 to a floatxx exponent to get the
   882  		// corresponding Float exponent.
   883  		// (see also implementation of math.Ldexp for similar code)
   884  
   885  		if x.exp < dmin+1 {
   886  			// underflow
   887  			if x.neg {
   888  				var z float32
   889  				return -z, Above
   890  			}
   891  			return 0.0, Below
   892  		}
   893  		// x.exp >= dmin+1
   894  
   895  		var r Float
   896  		r.prec = mbits + 1 // +1 for implicit msb
   897  		if x.exp < emin+1 {
   898  			// denormal number - round to fewer bits
   899  			r.prec = uint32(x.exp - dmin)
   900  		}
   901  		r.Set(x)
   902  
   903  		// Rounding may have caused r to overflow to ±Inf
   904  		// (rounding never causes underflows to 0).
   905  		if r.form == inf {
   906  			r.exp = emax + 2 // cause overflow below
   907  		}
   908  
   909  		if r.exp > emax+1 {
   910  			// overflow
   911  			if x.neg {
   912  				return float32(math.Inf(-1)), Below
   913  			}
   914  			return float32(math.Inf(+1)), Above
   915  		}
   916  		// dmin+1 <= r.exp <= emax+1
   917  
   918  		var s uint32
   919  		if r.neg {
   920  			s = 1 << (fbits - 1)
   921  		}
   922  
   923  		m := high32(r.mant) >> ebits & (1<<mbits - 1) // cut off msb (implicit 1 bit)
   924  
   925  		// Rounding may have caused a denormal number to
   926  		// become normal. Check again.
   927  		c := float32(1.0)
   928  		if r.exp < emin+1 {
   929  			// denormal number
   930  			r.exp += mbits
   931  			c = 1.0 / (1 << mbits) // 2**-mbits
   932  		}
   933  		// emin+1 <= r.exp <= emax+1
   934  		e := uint32(r.exp-emin) << mbits
   935  
   936  		return c * math.Float32frombits(s|e|m), r.acc
   937  
   938  	case zero:
   939  		if x.neg {
   940  			var z float32
   941  			return -z, Exact
   942  		}
   943  		return 0.0, Exact
   944  
   945  	case inf:
   946  		if x.neg {
   947  			return float32(math.Inf(-1)), Exact
   948  		}
   949  		return float32(math.Inf(+1)), Exact
   950  	}
   951  
   952  	panic("unreachable")
   953  }
   954  
   955  // Float64 returns the float64 value nearest to x. If x is too small to be
   956  // represented by a float64 (|x| < math.SmallestNonzeroFloat64), the result
   957  // is (0, Below) or (-0, Above), respectively, depending on the sign of x.
   958  // If x is too large to be represented by a float64 (|x| > math.MaxFloat64),
   959  // the result is (+Inf, Above) or (-Inf, Below), depending on the sign of x.
   960  func (x *Float) Float64() (float64, Accuracy) {
   961  	if debugFloat {
   962  		x.validate()
   963  	}
   964  
   965  	switch x.form {
   966  	case finite:
   967  		// 0 < |x| < +Inf
   968  
   969  		const (
   970  			fbits = 64                //        float size
   971  			mbits = 52                //        mantissa size (excluding implicit msb)
   972  			ebits = fbits - mbits - 1 //    11  exponent size
   973  			bias  = 1<<(ebits-1) - 1  //  1023  exponent bias
   974  			dmin  = 1 - bias - mbits  // -1074  smallest unbiased exponent (denormal)
   975  			emin  = 1 - bias          // -1022  smallest unbiased exponent (normal)
   976  			emax  = bias              //  1023  largest unbiased exponent (normal)
   977  		)
   978  
   979  		// Float mantissae m have an explicit msb and are in the range 0.5 <= m < 1.0.
   980  		// floatxx mantissae have an implicit msb and are in the range 1.0 <= m < 2.0.
   981  		// For a given mantissa m, we need to add 1 to a floatxx exponent to get the
   982  		// corresponding Float exponent.
   983  		// (see also implementation of math.Ldexp for similar code)
   984  
   985  		if x.exp < dmin+1 {
   986  			// underflow
   987  			if x.neg {
   988  				var z float64
   989  				return -z, Above
   990  			}
   991  			return 0.0, Below
   992  		}
   993  		// x.exp >= dmin+1
   994  
   995  		var r Float
   996  		r.prec = mbits + 1 // +1 for implicit msb
   997  		if x.exp < emin+1 {
   998  			// denormal number - round to fewer bits
   999  			r.prec = uint32(x.exp - dmin)
  1000  		}
  1001  		r.Set(x)
  1002  
  1003  		// Rounding may have caused r to overflow to ±Inf
  1004  		// (rounding never causes underflows to 0).
  1005  		if r.form == inf {
  1006  			r.exp = emax + 2 // cause overflow below
  1007  		}
  1008  
  1009  		if r.exp > emax+1 {
  1010  			// overflow
  1011  			if x.neg {
  1012  				return math.Inf(-1), Below
  1013  			}
  1014  			return math.Inf(+1), Above
  1015  		}
  1016  		// dmin+1 <= r.exp <= emax+1
  1017  
  1018  		var s uint64
  1019  		if r.neg {
  1020  			s = 1 << (fbits - 1)
  1021  		}
  1022  
  1023  		m := high64(r.mant) >> ebits & (1<<mbits - 1) // cut off msb (implicit 1 bit)
  1024  
  1025  		// Rounding may have caused a denormal number to
  1026  		// become normal. Check again.
  1027  		c := 1.0
  1028  		if r.exp < emin+1 {
  1029  			// denormal number
  1030  			r.exp += mbits
  1031  			c = 1.0 / (1 << mbits) // 2**-mbits
  1032  		}
  1033  		// emin+1 <= r.exp <= emax+1
  1034  		e := uint64(r.exp-emin) << mbits
  1035  
  1036  		return c * math.Float64frombits(s|e|m), r.acc
  1037  
  1038  	case zero:
  1039  		if x.neg {
  1040  			var z float64
  1041  			return -z, Exact
  1042  		}
  1043  		return 0.0, Exact
  1044  
  1045  	case inf:
  1046  		if x.neg {
  1047  			return math.Inf(-1), Exact
  1048  		}
  1049  		return math.Inf(+1), Exact
  1050  	}
  1051  
  1052  	panic("unreachable")
  1053  }
  1054  
  1055  // Int returns the result of truncating x towards zero;
  1056  // or nil if x is an infinity.
  1057  // The result is Exact if x.IsInt(); otherwise it is Below
  1058  // for x > 0, and Above for x < 0.
  1059  // If a non-nil *Int argument z is provided, Int stores
  1060  // the result in z instead of allocating a new Int.
  1061  func (x *Float) Int(z *Int) (*Int, Accuracy) {
  1062  	if debugFloat {
  1063  		x.validate()
  1064  	}
  1065  
  1066  	if z == nil && x.form <= finite {
  1067  		z = new(Int)
  1068  	}
  1069  
  1070  	switch x.form {
  1071  	case finite:
  1072  		// 0 < |x| < +Inf
  1073  		acc := makeAcc(x.neg)
  1074  		if x.exp <= 0 {
  1075  			// 0 < |x| < 1
  1076  			return z.SetInt64(0), acc
  1077  		}
  1078  		// x.exp > 0
  1079  
  1080  		// 1 <= |x| < +Inf
  1081  		// determine minimum required precision for x
  1082  		allBits := uint(len(x.mant)) * _W
  1083  		exp := uint(x.exp)
  1084  		if x.MinPrec() <= exp {
  1085  			acc = Exact
  1086  		}
  1087  		// shift mantissa as needed
  1088  		if z == nil {
  1089  			z = new(Int)
  1090  		}
  1091  		z.neg = x.neg
  1092  		switch {
  1093  		case exp > allBits:
  1094  			z.abs = z.abs.shl(x.mant, exp-allBits)
  1095  		default:
  1096  			z.abs = z.abs.set(x.mant)
  1097  		case exp < allBits:
  1098  			z.abs = z.abs.shr(x.mant, allBits-exp)
  1099  		}
  1100  		return z, acc
  1101  
  1102  	case zero:
  1103  		return z.SetInt64(0), Exact
  1104  
  1105  	case inf:
  1106  		return nil, makeAcc(x.neg)
  1107  	}
  1108  
  1109  	panic("unreachable")
  1110  }
  1111  
  1112  // Rat returns the rational number corresponding to x;
  1113  // or nil if x is an infinity.
  1114  // The result is Exact is x is not an Inf.
  1115  // If a non-nil *Rat argument z is provided, Rat stores
  1116  // the result in z instead of allocating a new Rat.
  1117  func (x *Float) Rat(z *Rat) (*Rat, Accuracy) {
  1118  	if debugFloat {
  1119  		x.validate()
  1120  	}
  1121  
  1122  	if z == nil && x.form <= finite {
  1123  		z = new(Rat)
  1124  	}
  1125  
  1126  	switch x.form {
  1127  	case finite:
  1128  		// 0 < |x| < +Inf
  1129  		allBits := int32(len(x.mant)) * _W
  1130  		// build up numerator and denominator
  1131  		z.a.neg = x.neg
  1132  		switch {
  1133  		case x.exp > allBits:
  1134  			z.a.abs = z.a.abs.shl(x.mant, uint(x.exp-allBits))
  1135  			z.b.abs = z.b.abs[:0] // == 1 (see Rat)
  1136  			// z already in normal form
  1137  		default:
  1138  			z.a.abs = z.a.abs.set(x.mant)
  1139  			z.b.abs = z.b.abs[:0] // == 1 (see Rat)
  1140  			// z already in normal form
  1141  		case x.exp < allBits:
  1142  			z.a.abs = z.a.abs.set(x.mant)
  1143  			t := z.b.abs.setUint64(1)
  1144  			z.b.abs = t.shl(t, uint(allBits-x.exp))
  1145  			z.norm()
  1146  		}
  1147  		return z, Exact
  1148  
  1149  	case zero:
  1150  		return z.SetInt64(0), Exact
  1151  
  1152  	case inf:
  1153  		return nil, makeAcc(x.neg)
  1154  	}
  1155  
  1156  	panic("unreachable")
  1157  }
  1158  
  1159  // Abs sets z to the (possibly rounded) value |x| (the absolute value of x)
  1160  // and returns z.
  1161  func (z *Float) Abs(x *Float) *Float {
  1162  	z.Set(x)
  1163  	z.neg = false
  1164  	return z
  1165  }
  1166  
  1167  // Neg sets z to the (possibly rounded) value of x with its sign negated,
  1168  // and returns z.
  1169  func (z *Float) Neg(x *Float) *Float {
  1170  	z.Set(x)
  1171  	z.neg = !z.neg
  1172  	return z
  1173  }
  1174  
  1175  func validateBinaryOperands(x, y *Float) {
  1176  	if !debugFloat {
  1177  		// avoid performance bugs
  1178  		panic("validateBinaryOperands called but debugFloat is not set")
  1179  	}
  1180  	if len(x.mant) == 0 {
  1181  		panic("empty mantissa for x")
  1182  	}
  1183  	if len(y.mant) == 0 {
  1184  		panic("empty mantissa for y")
  1185  	}
  1186  }
  1187  
  1188  // z = x + y, ignoring signs of x and y for the addition
  1189  // but using the sign of z for rounding the result.
  1190  // x and y must have a non-empty mantissa and valid exponent.
  1191  func (z *Float) uadd(x, y *Float) {
  1192  	// Note: This implementation requires 2 shifts most of the
  1193  	// time. It is also inefficient if exponents or precisions
  1194  	// differ by wide margins. The following article describes
  1195  	// an efficient (but much more complicated) implementation
  1196  	// compatible with the internal representation used here:
  1197  	//
  1198  	// Vincent Lefèvre: "The Generic Multiple-Precision Floating-
  1199  	// Point Addition With Exact Rounding (as in the MPFR Library)"
  1200  	// http://www.vinc17.net/research/papers/rnc6.pdf
  1201  
  1202  	if debugFloat {
  1203  		validateBinaryOperands(x, y)
  1204  	}
  1205  
  1206  	// compute exponents ex, ey for mantissa with "binary point"
  1207  	// on the right (mantissa.0) - use int64 to avoid overflow
  1208  	ex := int64(x.exp) - int64(len(x.mant))*_W
  1209  	ey := int64(y.exp) - int64(len(y.mant))*_W
  1210  
  1211  	// TODO(gri) having a combined add-and-shift primitive
  1212  	//           could make this code significantly faster
  1213  	switch {
  1214  	case ex < ey:
  1215  		// cannot re-use z.mant w/o testing for aliasing
  1216  		t := nat(nil).shl(y.mant, uint(ey-ex))
  1217  		z.mant = z.mant.add(x.mant, t)
  1218  	default:
  1219  		// ex == ey, no shift needed
  1220  		z.mant = z.mant.add(x.mant, y.mant)
  1221  	case ex > ey:
  1222  		// cannot re-use z.mant w/o testing for aliasing
  1223  		t := nat(nil).shl(x.mant, uint(ex-ey))
  1224  		z.mant = z.mant.add(t, y.mant)
  1225  		ex = ey
  1226  	}
  1227  	// len(z.mant) > 0
  1228  
  1229  	z.setExpAndRound(ex+int64(len(z.mant))*_W-fnorm(z.mant), 0)
  1230  }
  1231  
  1232  // z = x - y for |x| > |y|, ignoring signs of x and y for the subtraction
  1233  // but using the sign of z for rounding the result.
  1234  // x and y must have a non-empty mantissa and valid exponent.
  1235  func (z *Float) usub(x, y *Float) {
  1236  	// This code is symmetric to uadd.
  1237  	// We have not factored the common code out because
  1238  	// eventually uadd (and usub) should be optimized
  1239  	// by special-casing, and the code will diverge.
  1240  
  1241  	if debugFloat {
  1242  		validateBinaryOperands(x, y)
  1243  	}
  1244  
  1245  	ex := int64(x.exp) - int64(len(x.mant))*_W
  1246  	ey := int64(y.exp) - int64(len(y.mant))*_W
  1247  
  1248  	switch {
  1249  	case ex < ey:
  1250  		// cannot re-use z.mant w/o testing for aliasing
  1251  		t := nat(nil).shl(y.mant, uint(ey-ex))
  1252  		z.mant = t.sub(x.mant, t)
  1253  	default:
  1254  		// ex == ey, no shift needed
  1255  		z.mant = z.mant.sub(x.mant, y.mant)
  1256  	case ex > ey:
  1257  		// cannot re-use z.mant w/o testing for aliasing
  1258  		t := nat(nil).shl(x.mant, uint(ex-ey))
  1259  		z.mant = t.sub(t, y.mant)
  1260  		ex = ey
  1261  	}
  1262  
  1263  	// operands may have cancelled each other out
  1264  	if len(z.mant) == 0 {
  1265  		z.acc = Exact
  1266  		z.form = zero
  1267  		z.neg = false
  1268  		return
  1269  	}
  1270  	// len(z.mant) > 0
  1271  
  1272  	z.setExpAndRound(ex+int64(len(z.mant))*_W-fnorm(z.mant), 0)
  1273  }
  1274  
  1275  // z = x * y, ignoring signs of x and y for the multiplication
  1276  // but using the sign of z for rounding the result.
  1277  // x and y must have a non-empty mantissa and valid exponent.
  1278  func (z *Float) umul(x, y *Float) {
  1279  	if debugFloat {
  1280  		validateBinaryOperands(x, y)
  1281  	}
  1282  
  1283  	// Note: This is doing too much work if the precision
  1284  	// of z is less than the sum of the precisions of x
  1285  	// and y which is often the case (e.g., if all floats
  1286  	// have the same precision).
  1287  	// TODO(gri) Optimize this for the common case.
  1288  
  1289  	e := int64(x.exp) + int64(y.exp)
  1290  	z.mant = z.mant.mul(x.mant, y.mant)
  1291  
  1292  	z.setExpAndRound(e-fnorm(z.mant), 0)
  1293  }
  1294  
  1295  // z = x / y, ignoring signs of x and y for the division
  1296  // but using the sign of z for rounding the result.
  1297  // x and y must have a non-empty mantissa and valid exponent.
  1298  func (z *Float) uquo(x, y *Float) {
  1299  	if debugFloat {
  1300  		validateBinaryOperands(x, y)
  1301  	}
  1302  
  1303  	// mantissa length in words for desired result precision + 1
  1304  	// (at least one extra bit so we get the rounding bit after
  1305  	// the division)
  1306  	n := int(z.prec/_W) + 1
  1307  
  1308  	// compute adjusted x.mant such that we get enough result precision
  1309  	xadj := x.mant
  1310  	if d := n - len(x.mant) + len(y.mant); d > 0 {
  1311  		// d extra words needed => add d "0 digits" to x
  1312  		xadj = make(nat, len(x.mant)+d)
  1313  		copy(xadj[d:], x.mant)
  1314  	}
  1315  	// TODO(gri): If we have too many digits (d < 0), we should be able
  1316  	// to shorten x for faster division. But we must be extra careful
  1317  	// with rounding in that case.
  1318  
  1319  	// Compute d before division since there may be aliasing of x.mant
  1320  	// (via xadj) or y.mant with z.mant.
  1321  	d := len(xadj) - len(y.mant)
  1322  
  1323  	// divide
  1324  	var r nat
  1325  	z.mant, r = z.mant.div(nil, xadj, y.mant)
  1326  	e := int64(x.exp) - int64(y.exp) - int64(d-len(z.mant))*_W
  1327  
  1328  	// The result is long enough to include (at least) the rounding bit.
  1329  	// If there's a non-zero remainder, the corresponding fractional part
  1330  	// (if it were computed), would have a non-zero sticky bit (if it were
  1331  	// zero, it couldn't have a non-zero remainder).
  1332  	var sbit uint
  1333  	if len(r) > 0 {
  1334  		sbit = 1
  1335  	}
  1336  
  1337  	z.setExpAndRound(e-fnorm(z.mant), sbit)
  1338  }
  1339  
  1340  // ucmp returns -1, 0, or +1, depending on whether
  1341  // |x| < |y|, |x| == |y|, or |x| > |y|.
  1342  // x and y must have a non-empty mantissa and valid exponent.
  1343  func (x *Float) ucmp(y *Float) int {
  1344  	if debugFloat {
  1345  		validateBinaryOperands(x, y)
  1346  	}
  1347  
  1348  	switch {
  1349  	case x.exp < y.exp:
  1350  		return -1
  1351  	case x.exp > y.exp:
  1352  		return +1
  1353  	}
  1354  	// x.exp == y.exp
  1355  
  1356  	// compare mantissas
  1357  	i := len(x.mant)
  1358  	j := len(y.mant)
  1359  	for i > 0 || j > 0 {
  1360  		var xm, ym Word
  1361  		if i > 0 {
  1362  			i--
  1363  			xm = x.mant[i]
  1364  		}
  1365  		if j > 0 {
  1366  			j--
  1367  			ym = y.mant[j]
  1368  		}
  1369  		switch {
  1370  		case xm < ym:
  1371  			return -1
  1372  		case xm > ym:
  1373  			return +1
  1374  		}
  1375  	}
  1376  
  1377  	return 0
  1378  }
  1379  
  1380  // Handling of sign bit as defined by IEEE 754-2008, section 6.3:
  1381  //
  1382  // When neither the inputs nor result are NaN, the sign of a product or
  1383  // quotient is the exclusive OR of the operands’ signs; the sign of a sum,
  1384  // or of a difference x−y regarded as a sum x+(−y), differs from at most
  1385  // one of the addends’ signs; and the sign of the result of conversions,
  1386  // the quantize operation, the roundToIntegral operations, and the
  1387  // roundToIntegralExact (see 5.3.1) is the sign of the first or only operand.
  1388  // These rules shall apply even when operands or results are zero or infinite.
  1389  //
  1390  // When the sum of two operands with opposite signs (or the difference of
  1391  // two operands with like signs) is exactly zero, the sign of that sum (or
  1392  // difference) shall be +0 in all rounding-direction attributes except
  1393  // roundTowardNegative; under that attribute, the sign of an exact zero
  1394  // sum (or difference) shall be −0. However, x+x = x−(−x) retains the same
  1395  // sign as x even when x is zero.
  1396  //
  1397  // See also: http://play.golang.org/p/RtH3UCt5IH
  1398  
  1399  // Add sets z to the rounded sum x+y and returns z. If z's precision is 0,
  1400  // it is changed to the larger of x's or y's precision before the operation.
  1401  // Rounding is performed according to z's precision and rounding mode; and
  1402  // z's accuracy reports the result error relative to the exact (not rounded)
  1403  // result. Add panics with ErrNaN if x and y are infinities with opposite
  1404  // signs. The value of z is undefined in that case.
  1405  //
  1406  // BUG(gri) When rounding ToNegativeInf, the sign of Float values rounded to 0 is incorrect.
  1407  func (z *Float) Add(x, y *Float) *Float {
  1408  	if debugFloat {
  1409  		x.validate()
  1410  		y.validate()
  1411  	}
  1412  
  1413  	if z.prec == 0 {
  1414  		z.prec = umax32(x.prec, y.prec)
  1415  	}
  1416  
  1417  	if x.form == finite && y.form == finite {
  1418  		// x + y (commom case)
  1419  		z.neg = x.neg
  1420  		if x.neg == y.neg {
  1421  			// x + y == x + y
  1422  			// (-x) + (-y) == -(x + y)
  1423  			z.uadd(x, y)
  1424  		} else {
  1425  			// x + (-y) == x - y == -(y - x)
  1426  			// (-x) + y == y - x == -(x - y)
  1427  			if x.ucmp(y) > 0 {
  1428  				z.usub(x, y)
  1429  			} else {
  1430  				z.neg = !z.neg
  1431  				z.usub(y, x)
  1432  			}
  1433  		}
  1434  		return z
  1435  	}
  1436  
  1437  	if x.form == inf && y.form == inf && x.neg != y.neg {
  1438  		// +Inf + -Inf
  1439  		// -Inf + +Inf
  1440  		// value of z is undefined but make sure it's valid
  1441  		z.acc = Exact
  1442  		z.form = zero
  1443  		z.neg = false
  1444  		panic(ErrNaN{"addition of infinities with opposite signs"})
  1445  	}
  1446  
  1447  	if x.form == zero && y.form == zero {
  1448  		// ±0 + ±0
  1449  		z.acc = Exact
  1450  		z.form = zero
  1451  		z.neg = x.neg && y.neg // -0 + -0 == -0
  1452  		return z
  1453  	}
  1454  
  1455  	if x.form == inf || y.form == zero {
  1456  		// ±Inf + y
  1457  		// x + ±0
  1458  		return z.Set(x)
  1459  	}
  1460  
  1461  	// ±0 + y
  1462  	// x + ±Inf
  1463  	return z.Set(y)
  1464  }
  1465  
  1466  // Sub sets z to the rounded difference x-y and returns z.
  1467  // Precision, rounding, and accuracy reporting are as for Add.
  1468  // Sub panics with ErrNaN if x and y are infinities with equal
  1469  // signs. The value of z is undefined in that case.
  1470  func (z *Float) Sub(x, y *Float) *Float {
  1471  	if debugFloat {
  1472  		x.validate()
  1473  		y.validate()
  1474  	}
  1475  
  1476  	if z.prec == 0 {
  1477  		z.prec = umax32(x.prec, y.prec)
  1478  	}
  1479  
  1480  	if x.form == finite && y.form == finite {
  1481  		// x - y (common case)
  1482  		z.neg = x.neg
  1483  		if x.neg != y.neg {
  1484  			// x - (-y) == x + y
  1485  			// (-x) - y == -(x + y)
  1486  			z.uadd(x, y)
  1487  		} else {
  1488  			// x - y == x - y == -(y - x)
  1489  			// (-x) - (-y) == y - x == -(x - y)
  1490  			if x.ucmp(y) > 0 {
  1491  				z.usub(x, y)
  1492  			} else {
  1493  				z.neg = !z.neg
  1494  				z.usub(y, x)
  1495  			}
  1496  		}
  1497  		return z
  1498  	}
  1499  
  1500  	if x.form == inf && y.form == inf && x.neg == y.neg {
  1501  		// +Inf - +Inf
  1502  		// -Inf - -Inf
  1503  		// value of z is undefined but make sure it's valid
  1504  		z.acc = Exact
  1505  		z.form = zero
  1506  		z.neg = false
  1507  		panic(ErrNaN{"subtraction of infinities with equal signs"})
  1508  	}
  1509  
  1510  	if x.form == zero && y.form == zero {
  1511  		// ±0 - ±0
  1512  		z.acc = Exact
  1513  		z.form = zero
  1514  		z.neg = x.neg && !y.neg // -0 - +0 == -0
  1515  		return z
  1516  	}
  1517  
  1518  	if x.form == inf || y.form == zero {
  1519  		// ±Inf - y
  1520  		// x - ±0
  1521  		return z.Set(x)
  1522  	}
  1523  
  1524  	// ±0 - y
  1525  	// x - ±Inf
  1526  	return z.Neg(y)
  1527  }
  1528  
  1529  // Mul sets z to the rounded product x*y and returns z.
  1530  // Precision, rounding, and accuracy reporting are as for Add.
  1531  // Mul panics with ErrNaN if one operand is zero and the other
  1532  // operand an infinity. The value of z is undefined in that case.
  1533  func (z *Float) Mul(x, y *Float) *Float {
  1534  	if debugFloat {
  1535  		x.validate()
  1536  		y.validate()
  1537  	}
  1538  
  1539  	if z.prec == 0 {
  1540  		z.prec = umax32(x.prec, y.prec)
  1541  	}
  1542  
  1543  	z.neg = x.neg != y.neg
  1544  
  1545  	if x.form == finite && y.form == finite {
  1546  		// x * y (common case)
  1547  		z.umul(x, y)
  1548  		return z
  1549  	}
  1550  
  1551  	z.acc = Exact
  1552  	if x.form == zero && y.form == inf || x.form == inf && y.form == zero {
  1553  		// ±0 * ±Inf
  1554  		// ±Inf * ±0
  1555  		// value of z is undefined but make sure it's valid
  1556  		z.form = zero
  1557  		z.neg = false
  1558  		panic(ErrNaN{"multiplication of zero with infinity"})
  1559  	}
  1560  
  1561  	if x.form == inf || y.form == inf {
  1562  		// ±Inf * y
  1563  		// x * ±Inf
  1564  		z.form = inf
  1565  		return z
  1566  	}
  1567  
  1568  	// ±0 * y
  1569  	// x * ±0
  1570  	z.form = zero
  1571  	return z
  1572  }
  1573  
  1574  // Quo sets z to the rounded quotient x/y and returns z.
  1575  // Precision, rounding, and accuracy reporting are as for Add.
  1576  // Quo panics with ErrNaN if both operands are zero or infinities.
  1577  // The value of z is undefined in that case.
  1578  func (z *Float) Quo(x, y *Float) *Float {
  1579  	if debugFloat {
  1580  		x.validate()
  1581  		y.validate()
  1582  	}
  1583  
  1584  	if z.prec == 0 {
  1585  		z.prec = umax32(x.prec, y.prec)
  1586  	}
  1587  
  1588  	z.neg = x.neg != y.neg
  1589  
  1590  	if x.form == finite && y.form == finite {
  1591  		// x / y (common case)
  1592  		z.uquo(x, y)
  1593  		return z
  1594  	}
  1595  
  1596  	z.acc = Exact
  1597  	if x.form == zero && y.form == zero || x.form == inf && y.form == inf {
  1598  		// ±0 / ±0
  1599  		// ±Inf / ±Inf
  1600  		// value of z is undefined but make sure it's valid
  1601  		z.form = zero
  1602  		z.neg = false
  1603  		panic(ErrNaN{"division of zero by zero or infinity by infinity"})
  1604  	}
  1605  
  1606  	if x.form == zero || y.form == inf {
  1607  		// ±0 / y
  1608  		// x / ±Inf
  1609  		z.form = zero
  1610  		return z
  1611  	}
  1612  
  1613  	// x / ±0
  1614  	// ±Inf / y
  1615  	z.form = inf
  1616  	return z
  1617  }
  1618  
  1619  // Cmp compares x and y and returns:
  1620  //
  1621  //   -1 if x <  y
  1622  //    0 if x == y (incl. -0 == 0, -Inf == -Inf, and +Inf == +Inf)
  1623  //   +1 if x >  y
  1624  //
  1625  func (x *Float) Cmp(y *Float) int {
  1626  	if debugFloat {
  1627  		x.validate()
  1628  		y.validate()
  1629  	}
  1630  
  1631  	mx := x.ord()
  1632  	my := y.ord()
  1633  	switch {
  1634  	case mx < my:
  1635  		return -1
  1636  	case mx > my:
  1637  		return +1
  1638  	}
  1639  	// mx == my
  1640  
  1641  	// only if |mx| == 1 we have to compare the mantissae
  1642  	switch mx {
  1643  	case -1:
  1644  		return y.ucmp(x)
  1645  	case +1:
  1646  		return x.ucmp(y)
  1647  	}
  1648  
  1649  	return 0
  1650  }
  1651  
  1652  // ord classifies x and returns:
  1653  //
  1654  //	-2 if -Inf == x
  1655  //	-1 if -Inf < x < 0
  1656  //	 0 if x == 0 (signed or unsigned)
  1657  //	+1 if 0 < x < +Inf
  1658  //	+2 if x == +Inf
  1659  //
  1660  func (x *Float) ord() int {
  1661  	var m int
  1662  	switch x.form {
  1663  	case finite:
  1664  		m = 1
  1665  	case zero:
  1666  		return 0
  1667  	case inf:
  1668  		m = 2
  1669  	}
  1670  	if x.neg {
  1671  		m = -m
  1672  	}
  1673  	return m
  1674  }
  1675  
  1676  func umax32(x, y uint32) uint32 {
  1677  	if x > y {
  1678  		return x
  1679  	}
  1680  	return y
  1681  }