github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/roachpb/data.go

github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/roachpb/data.go (about)

     1  // Copyright 2014 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package roachpb
    12  
    13  import (
    14  	"bytes"
    15  	"context"
    16  	"encoding/binary"
    17  	"encoding/hex"
    18  	"fmt"
    19  	"hash"
    20  	"hash/crc32"
    21  	"math"
    22  	"math/rand"
    23  	"sort"
    24  	"strconv"
    25  	"strings"
    26  	"sync"
    27  	"time"
    28  
    29  	"github.com/cockroachdb/apd"
    30  	"github.com/cockroachdb/cockroach/pkg/geo/geopb"
    31  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency/lock"
    32  	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
    33  	"github.com/cockroachdb/cockroach/pkg/util"
    34  	"github.com/cockroachdb/cockroach/pkg/util/bitarray"
    35  	"github.com/cockroachdb/cockroach/pkg/util/duration"
    36  	"github.com/cockroachdb/cockroach/pkg/util/encoding"
    37  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    38  	"github.com/cockroachdb/cockroach/pkg/util/interval"
    39  	"github.com/cockroachdb/cockroach/pkg/util/log"
    40  	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
    41  	"github.com/cockroachdb/cockroach/pkg/util/timetz"
    42  	"github.com/cockroachdb/cockroach/pkg/util/uuid"
    43  	"github.com/cockroachdb/errors"
    44  	"go.etcd.io/etcd/raft/raftpb"
    45  )
    46  
var (
	// RKeyMin is a minimum key value which sorts before all other keys.
	RKeyMin = RKey("")
	// KeyMin is a minimum key value which sorts before all other keys.
	KeyMin = Key(RKeyMin)
	// RKeyMax is a maximum key value which sorts after all other keys.
	RKeyMax = RKey{0xff, 0xff}
	// KeyMax is a maximum key value which sorts after all other keys.
	KeyMax = Key(RKeyMax)

	// PrettyPrintKey prints a key in human readable format. It is a hook
	// implemented in package github.com/cockroachdb/cockroach/pkg/keys to
	// avoid a circular package import; it may be nil, so callers in this
	// file check it before use.
	// valDirs correspond to the encoding direction of each encoded value
	// in the key (if known). If left unspecified, the default encoding
	// direction for each value type is used (see
	// encoding.go:prettyPrintFirstValue).
	PrettyPrintKey func(valDirs []encoding.Direction, key Key) string

	// PrettyPrintRange prints a key range in human readable format. It is a
	// hook implemented in package github.com/cockroachdb/cockroach/pkg/keys
	// to avoid a circular package import.
	PrettyPrintRange func(start, end Key, maxChars int) string
)
    71  
// RKey denotes a Key whose local addressing has been accounted for.
// A key can be transformed to an RKey by keys.Addr().
//
// RKey stands for "resolved key," as in a key whose address has been resolved.
// It shares Key's underlying representation, so the two convert freely.
type RKey Key
    77  
    78  // AsRawKey returns the RKey as a Key. This is to be used only in select
    79  // situations in which an RKey is known to not contain a wrapped locally-
    80  // addressed Key. That is, it must only be used when the original Key was not a
    81  // local key. Whenever the Key which created the RKey is still available, it
    82  // should be used instead.
    83  func (rk RKey) AsRawKey() Key {
    84  	return Key(rk)
    85  }
    86  
    87  // Less compares two RKeys.
    88  func (rk RKey) Less(otherRK RKey) bool {
    89  	return bytes.Compare(rk, otherRK) < 0
    90  }
    91  
    92  // Equal checks for byte-wise equality.
    93  func (rk RKey) Equal(other []byte) bool {
    94  	return bytes.Equal(rk, other)
    95  }
    96  
    97  // Next returns the RKey that sorts immediately after the given one.
    98  // The method may only take a shallow copy of the RKey, so both the
    99  // receiver and the return value should be treated as immutable after.
   100  func (rk RKey) Next() RKey {
   101  	return RKey(BytesNext(rk))
   102  }
   103  
   104  // PrefixEnd determines the end key given key as a prefix, that is the
   105  // key that sorts precisely behind all keys starting with prefix: "1"
   106  // is added to the final byte and the carry propagated. The special
   107  // cases of nil and KeyMin always returns KeyMax.
   108  func (rk RKey) PrefixEnd() RKey {
   109  	if len(rk) == 0 {
   110  		return RKeyMax
   111  	}
   112  	return RKey(bytesPrefixEnd(rk))
   113  }
   114  
   115  func (rk RKey) String() string {
   116  	return Key(rk).String()
   117  }
   118  
   119  // StringWithDirs - see Key.String.WithDirs.
   120  func (rk RKey) StringWithDirs(valDirs []encoding.Direction, maxLen int) string {
   121  	return Key(rk).StringWithDirs(valDirs, maxLen)
   122  }
   123  
// Key is a custom type for a byte string in proto
// messages which refer to Cockroach keys. Keys are ordered and compared
// byte-wise (see Key.Compare and Key.Equal).
type Key []byte
   127  
   128  // BytesNext returns the next possible byte slice, using the extra capacity
   129  // of the provided slice if possible, and if not, appending an \x00.
   130  func BytesNext(b []byte) []byte {
   131  	if cap(b) > len(b) {
   132  		bNext := b[:len(b)+1]
   133  		if bNext[len(bNext)-1] == 0 {
   134  			return bNext
   135  		}
   136  	}
   137  	// TODO(spencer): Do we need to enforce KeyMaxLength here?
   138  	// Switched to "make and copy" pattern in #4963 for performance.
   139  	bn := make([]byte, len(b)+1)
   140  	copy(bn, b)
   141  	bn[len(bn)-1] = 0
   142  	return bn
   143  }
   144  
   145  func bytesPrefixEnd(b []byte) []byte {
   146  	// Switched to "make and copy" pattern in #4963 for performance.
   147  	end := make([]byte, len(b))
   148  	copy(end, b)
   149  	for i := len(end) - 1; i >= 0; i-- {
   150  		end[i] = end[i] + 1
   151  		if end[i] != 0 {
   152  			return end[:i+1]
   153  		}
   154  	}
   155  	// This statement will only be reached if the key is already a
   156  	// maximal byte string (i.e. already \xff...).
   157  	return b
   158  }
   159  
   160  // Next returns the next key in lexicographic sort order. The method may only
   161  // take a shallow copy of the Key, so both the receiver and the return
   162  // value should be treated as immutable after.
   163  func (k Key) Next() Key {
   164  	return Key(BytesNext(k))
   165  }
   166  
   167  // IsPrev is a more efficient version of k.Next().Equal(m).
   168  func (k Key) IsPrev(m Key) bool {
   169  	l := len(m) - 1
   170  	return l == len(k) && m[l] == 0 && k.Equal(m[:l])
   171  }
   172  
   173  // PrefixEnd determines the end key given key as a prefix, that is the
   174  // key that sorts precisely behind all keys starting with prefix: "1"
   175  // is added to the final byte and the carry propagated. The special
   176  // cases of nil and KeyMin always returns KeyMax.
   177  func (k Key) PrefixEnd() Key {
   178  	if len(k) == 0 {
   179  		return Key(RKeyMax)
   180  	}
   181  	return Key(bytesPrefixEnd(k))
   182  }
   183  
   184  // Equal returns whether two keys are identical.
   185  func (k Key) Equal(l Key) bool {
   186  	return bytes.Equal(k, l)
   187  }
   188  
   189  // Compare compares the two Keys.
   190  func (k Key) Compare(b Key) int {
   191  	return bytes.Compare(k, b)
   192  }
   193  
   194  // String returns a string-formatted version of the key.
   195  func (k Key) String() string {
   196  	return k.StringWithDirs(nil /* valDirs */, 0 /* maxLen */)
   197  }
   198  
   199  // StringWithDirs is the value encoding direction-aware version of String.
   200  //
   201  // Args:
   202  // valDirs: The direction for the key's components, generally needed for correct
   203  // 	decoding. If nil, the values are pretty-printed with default encoding
   204  // 	direction.
   205  // maxLen: If not 0, only the first maxLen chars from the decoded key are
   206  //   returned, plus a "..." suffix.
   207  func (k Key) StringWithDirs(valDirs []encoding.Direction, maxLen int) string {
   208  	var s string
   209  	if PrettyPrintKey != nil {
   210  		s = PrettyPrintKey(valDirs, k)
   211  	} else {
   212  		s = fmt.Sprintf("%q", []byte(k))
   213  	}
   214  	if maxLen != 0 && len(s) > maxLen {
   215  		return s[0:maxLen] + "..."
   216  	}
   217  	return s
   218  }
   219  
   220  // Format implements the fmt.Formatter interface.
   221  func (k Key) Format(f fmt.State, verb rune) {
   222  	// Note: this implementation doesn't handle the width and precision
   223  	// specifiers such as "%20.10s".
   224  	if verb == 'x' {
   225  		fmt.Fprintf(f, "%x", []byte(k))
   226  	} else if PrettyPrintKey != nil {
   227  		fmt.Fprint(f, PrettyPrintKey(nil /* valDirs */, k))
   228  	} else {
   229  		fmt.Fprint(f, strconv.Quote(string(k)))
   230  	}
   231  }
   232  
// Layout of Value.RawBytes: a checksumSize-byte checksum, then one tag byte at
// tagPos, then the encoded data starting at headerSize.
const (
	// checksumUninitialized is the checksum value reserved to mean that no
	// checksum has been computed.
	checksumUninitialized = 0
	// checksumSize is the number of leading bytes holding the checksum.
	checksumSize          = 4
	// tagPos is the index of the value-type tag byte, right after the checksum.
	tagPos                = checksumSize
	// headerSize is the combined length of the checksum and the tag byte.
	headerSize            = tagPos + 1
)
   239  
   240  func (v Value) checksum() uint32 {
   241  	if len(v.RawBytes) < checksumSize {
   242  		return 0
   243  	}
   244  	_, u, err := encoding.DecodeUint32Ascending(v.RawBytes[:checksumSize])
   245  	if err != nil {
   246  		panic(err)
   247  	}
   248  	return u
   249  }
   250  
   251  func (v *Value) setChecksum(cksum uint32) {
   252  	if len(v.RawBytes) >= checksumSize {
   253  		encoding.EncodeUint32Ascending(v.RawBytes[:0], cksum)
   254  	}
   255  }
   256  
   257  // InitChecksum initializes a checksum based on the provided key and
   258  // the contents of the value. If the value contains a byte slice, the
   259  // checksum includes it directly.
   260  //
   261  // TODO(peter): This method should return an error if the Value is corrupted
   262  // (e.g. the RawBytes field is > 0 but smaller than the header size).
   263  func (v *Value) InitChecksum(key []byte) {
   264  	if v.RawBytes == nil {
   265  		return
   266  	}
   267  	// Should be uninitialized.
   268  	if v.checksum() != checksumUninitialized {
   269  		panic(fmt.Sprintf("initialized checksum = %x", v.checksum()))
   270  	}
   271  	v.setChecksum(v.computeChecksum(key))
   272  }
   273  
   274  // ClearChecksum clears the checksum value.
   275  func (v *Value) ClearChecksum() {
   276  	v.setChecksum(0)
   277  }
   278  
   279  // Verify verifies the value's Checksum matches a newly-computed
   280  // checksum of the value's contents. If the value's Checksum is not
   281  // set the verification is a noop.
   282  func (v Value) Verify(key []byte) error {
   283  	if n := len(v.RawBytes); n > 0 && n < headerSize {
   284  		return fmt.Errorf("%s: invalid header size: %d", Key(key), n)
   285  	}
   286  	if sum := v.checksum(); sum != 0 {
   287  		if computedSum := v.computeChecksum(key); computedSum != sum {
   288  			return fmt.Errorf("%s: invalid checksum (%x) value [% x]",
   289  				Key(key), computedSum, v.RawBytes)
   290  		}
   291  	}
   292  	return nil
   293  }
   294  
   295  // ShallowClone returns a shallow clone of the receiver.
   296  func (v *Value) ShallowClone() *Value {
   297  	if v == nil {
   298  		return nil
   299  	}
   300  	t := *v
   301  	return &t
   302  }
   303  
   304  // IsPresent returns true if the value is present (existent and not a tombstone).
   305  func (v *Value) IsPresent() bool {
   306  	return v != nil && len(v.RawBytes) != 0
   307  }
   308  
   309  // MakeValueFromString returns a value with bytes and tag set.
   310  func MakeValueFromString(s string) Value {
   311  	v := Value{}
   312  	v.SetString(s)
   313  	return v
   314  }
   315  
   316  // MakeValueFromBytes returns a value with bytes and tag set.
   317  func MakeValueFromBytes(bs []byte) Value {
   318  	v := Value{}
   319  	v.SetBytes(bs)
   320  	return v
   321  }
   322  
   323  // MakeValueFromBytesAndTimestamp returns a value with bytes, timestamp and
   324  // tag set.
   325  func MakeValueFromBytesAndTimestamp(bs []byte, t hlc.Timestamp) Value {
   326  	v := Value{Timestamp: t}
   327  	v.SetBytes(bs)
   328  	return v
   329  }
   330  
   331  // GetTag retrieves the value type.
   332  func (v Value) GetTag() ValueType {
   333  	if len(v.RawBytes) <= tagPos {
   334  		return ValueType_UNKNOWN
   335  	}
   336  	return ValueType(v.RawBytes[tagPos])
   337  }
   338  
   339  func (v *Value) setTag(t ValueType) {
   340  	v.RawBytes[tagPos] = byte(t)
   341  }
   342  
   343  func (v Value) dataBytes() []byte {
   344  	return v.RawBytes[headerSize:]
   345  }
   346  
   347  func (v *Value) ensureRawBytes(size int) {
   348  	if cap(v.RawBytes) < size {
   349  		v.RawBytes = make([]byte, size)
   350  		return
   351  	}
   352  	v.RawBytes = v.RawBytes[:size]
   353  	v.setChecksum(checksumUninitialized)
   354  }
   355  
   356  // EqualData returns a boolean reporting whether the receiver and the parameter
   357  // have equivalent byte values. This check ignores the optional checksum field
   358  // in the Values' byte slices, returning only whether the Values have the same
   359  // tag and encoded data.
   360  //
   361  // This method should be used whenever the raw bytes of two Values are being
   362  // compared instead of comparing the RawBytes slices directly because it ignores
   363  // the checksum header, which is optional.
   364  func (v Value) EqualData(o Value) bool {
   365  	return bytes.Equal(v.RawBytes[checksumSize:], o.RawBytes[checksumSize:])
   366  }
   367  
   368  // SetBytes sets the bytes and tag field of the receiver and clears the checksum.
   369  func (v *Value) SetBytes(b []byte) {
   370  	v.ensureRawBytes(headerSize + len(b))
   371  	copy(v.dataBytes(), b)
   372  	v.setTag(ValueType_BYTES)
   373  }
   374  
   375  // SetString sets the bytes and tag field of the receiver and clears the
   376  // checksum. This is identical to SetBytes, but specialized for a string
   377  // argument.
   378  func (v *Value) SetString(s string) {
   379  	v.ensureRawBytes(headerSize + len(s))
   380  	copy(v.dataBytes(), s)
   381  	v.setTag(ValueType_BYTES)
   382  }
   383  
   384  // SetFloat encodes the specified float64 value into the bytes field of the
   385  // receiver, sets the tag and clears the checksum.
   386  func (v *Value) SetFloat(f float64) {
   387  	v.ensureRawBytes(headerSize + 8)
   388  	encoding.EncodeUint64Ascending(v.RawBytes[headerSize:headerSize], math.Float64bits(f))
   389  	v.setTag(ValueType_FLOAT)
   390  }
   391  
   392  // SetGeo encodes the specified geo value into the bytes field of the
   393  // receiver, sets the tag and clears the checksum.
   394  func (v *Value) SetGeo(so geopb.SpatialObject) error {
   395  	bytes, err := protoutil.Marshal(&so)
   396  	if err != nil {
   397  		return err
   398  	}
   399  	v.ensureRawBytes(headerSize + len(bytes))
   400  	copy(v.dataBytes(), bytes)
   401  	v.setTag(ValueType_GEO)
   402  	return nil
   403  }
   404  
   405  // SetBool encodes the specified bool value into the bytes field of the
   406  // receiver, sets the tag and clears the checksum.
   407  func (v *Value) SetBool(b bool) {
   408  	// 0 or 1 will always encode to a 1-byte long varint.
   409  	v.ensureRawBytes(headerSize + 1)
   410  	i := int64(0)
   411  	if b {
   412  		i = 1
   413  	}
   414  	_ = binary.PutVarint(v.RawBytes[headerSize:], i)
   415  	v.setTag(ValueType_INT)
   416  }
   417  
   418  // SetInt encodes the specified int64 value into the bytes field of the
   419  // receiver, sets the tag and clears the checksum.
   420  func (v *Value) SetInt(i int64) {
   421  	v.ensureRawBytes(headerSize + binary.MaxVarintLen64)
   422  	n := binary.PutVarint(v.RawBytes[headerSize:], i)
   423  	v.RawBytes = v.RawBytes[:headerSize+n]
   424  	v.setTag(ValueType_INT)
   425  }
   426  
   427  // SetProto encodes the specified proto message into the bytes field of the
   428  // receiver and clears the checksum. If the proto message is an
   429  // InternalTimeSeriesData, the tag will be set to TIMESERIES rather than BYTES.
   430  func (v *Value) SetProto(msg protoutil.Message) error {
   431  	// All of the Cockroach protos implement MarshalTo and Size. So we marshal
   432  	// directly into the Value.RawBytes field instead of allocating a separate
   433  	// []byte and copying.
   434  	v.ensureRawBytes(headerSize + msg.Size())
   435  	if _, err := protoutil.MarshalTo(msg, v.RawBytes[headerSize:]); err != nil {
   436  		return err
   437  	}
   438  	// Special handling for timeseries data.
   439  	if _, ok := msg.(*InternalTimeSeriesData); ok {
   440  		v.setTag(ValueType_TIMESERIES)
   441  	} else {
   442  		v.setTag(ValueType_BYTES)
   443  	}
   444  	return nil
   445  }
   446  
   447  // SetTime encodes the specified time value into the bytes field of the
   448  // receiver, sets the tag and clears the checksum.
   449  func (v *Value) SetTime(t time.Time) {
   450  	const encodingSizeOverestimate = 11
   451  	v.ensureRawBytes(headerSize + encodingSizeOverestimate)
   452  	v.RawBytes = encoding.EncodeTimeAscending(v.RawBytes[:headerSize], t)
   453  	v.setTag(ValueType_TIME)
   454  }
   455  
   456  // SetTimeTZ encodes the specified time value into the bytes field of the
   457  // receiver, sets the tag and clears the checksum.
   458  func (v *Value) SetTimeTZ(t timetz.TimeTZ) {
   459  	v.ensureRawBytes(headerSize + encoding.EncodedTimeTZMaxLen)
   460  	v.RawBytes = encoding.EncodeTimeTZAscending(v.RawBytes[:headerSize], t)
   461  	v.setTag(ValueType_TIMETZ)
   462  }
   463  
   464  // SetDuration encodes the specified duration value into the bytes field of the
   465  // receiver, sets the tag and clears the checksum.
   466  func (v *Value) SetDuration(t duration.Duration) error {
   467  	var err error
   468  	v.ensureRawBytes(headerSize + encoding.EncodedDurationMaxLen)
   469  	v.RawBytes, err = encoding.EncodeDurationAscending(v.RawBytes[:headerSize], t)
   470  	if err != nil {
   471  		return err
   472  	}
   473  	v.setTag(ValueType_DURATION)
   474  	return nil
   475  }
   476  
   477  // SetBitArray encodes the specified bit array value into the bytes field of the
   478  // receiver, sets the tag and clears the checksum.
   479  func (v *Value) SetBitArray(t bitarray.BitArray) {
   480  	words, _ := t.EncodingParts()
   481  	v.ensureRawBytes(headerSize + encoding.NonsortingUvarintMaxLen + 8*len(words))
   482  	v.RawBytes = encoding.EncodeUntaggedBitArrayValue(v.RawBytes[:headerSize], t)
   483  	v.setTag(ValueType_BITARRAY)
   484  }
   485  
   486  // SetDecimal encodes the specified decimal value into the bytes field of
   487  // the receiver using Gob encoding, sets the tag and clears the checksum.
   488  func (v *Value) SetDecimal(dec *apd.Decimal) error {
   489  	decSize := encoding.UpperBoundNonsortingDecimalSize(dec)
   490  	v.ensureRawBytes(headerSize + decSize)
   491  	v.RawBytes = encoding.EncodeNonsortingDecimal(v.RawBytes[:headerSize], dec)
   492  	v.setTag(ValueType_DECIMAL)
   493  	return nil
   494  }
   495  
   496  // SetTuple sets the tuple bytes and tag field of the receiver and clears the
   497  // checksum.
   498  func (v *Value) SetTuple(data []byte) {
   499  	v.ensureRawBytes(headerSize + len(data))
   500  	copy(v.dataBytes(), data)
   501  	v.setTag(ValueType_TUPLE)
   502  }
   503  
   504  // GetBytes returns the bytes field of the receiver. If the tag is not
   505  // BYTES an error will be returned.
   506  func (v Value) GetBytes() ([]byte, error) {
   507  	if tag := v.GetTag(); tag != ValueType_BYTES {
   508  		return nil, fmt.Errorf("value type is not %s: %s", ValueType_BYTES, tag)
   509  	}
   510  	return v.dataBytes(), nil
   511  }
   512  
   513  // GetFloat decodes a float64 value from the bytes field of the receiver. If
   514  // the bytes field is not 8 bytes in length or the tag is not FLOAT an error
   515  // will be returned.
   516  func (v Value) GetFloat() (float64, error) {
   517  	if tag := v.GetTag(); tag != ValueType_FLOAT {
   518  		return 0, fmt.Errorf("value type is not %s: %s", ValueType_FLOAT, tag)
   519  	}
   520  	dataBytes := v.dataBytes()
   521  	if len(dataBytes) != 8 {
   522  		return 0, fmt.Errorf("float64 value should be exactly 8 bytes: %d", len(dataBytes))
   523  	}
   524  	_, u, err := encoding.DecodeUint64Ascending(dataBytes)
   525  	if err != nil {
   526  		return 0, err
   527  	}
   528  	return math.Float64frombits(u), nil
   529  }
   530  
   531  // GetGeo decodes a geo value from the bytes field of the receiver. If the
   532  // tag is not GEO an error will be returned.
   533  func (v Value) GetGeo() (geopb.SpatialObject, error) {
   534  	if tag := v.GetTag(); tag != ValueType_GEO {
   535  		return geopb.SpatialObject{}, fmt.Errorf("value type is not %s: %s", ValueType_GEO, tag)
   536  	}
   537  	var ret geopb.SpatialObject
   538  	err := protoutil.Unmarshal(v.dataBytes(), &ret)
   539  	return ret, err
   540  }
   541  
   542  // GetBool decodes a bool value from the bytes field of the receiver. If the
   543  // tag is not INT (the tag used for bool values) or the value cannot be decoded
   544  // an error will be returned.
   545  func (v Value) GetBool() (bool, error) {
   546  	if tag := v.GetTag(); tag != ValueType_INT {
   547  		return false, fmt.Errorf("value type is not %s: %s", ValueType_INT, tag)
   548  	}
   549  	i, n := binary.Varint(v.dataBytes())
   550  	if n <= 0 {
   551  		return false, fmt.Errorf("int64 varint decoding failed: %d", n)
   552  	}
   553  	if i > 1 || i < 0 {
   554  		return false, fmt.Errorf("invalid bool: %d", i)
   555  	}
   556  	return i != 0, nil
   557  }
   558  
   559  // GetInt decodes an int64 value from the bytes field of the receiver. If the
   560  // tag is not INT or the value cannot be decoded an error will be returned.
   561  func (v Value) GetInt() (int64, error) {
   562  	if tag := v.GetTag(); tag != ValueType_INT {
   563  		return 0, fmt.Errorf("value type is not %s: %s", ValueType_INT, tag)
   564  	}
   565  	i, n := binary.Varint(v.dataBytes())
   566  	if n <= 0 {
   567  		return 0, fmt.Errorf("int64 varint decoding failed: %d", n)
   568  	}
   569  	return i, nil
   570  }
   571  
   572  // GetProto unmarshals the bytes field of the receiver into msg. If
   573  // unmarshalling fails or the tag is not BYTES, an error will be
   574  // returned.
   575  func (v Value) GetProto(msg protoutil.Message) error {
   576  	expectedTag := ValueType_BYTES
   577  
   578  	// Special handling for ts data.
   579  	if _, ok := msg.(*InternalTimeSeriesData); ok {
   580  		expectedTag = ValueType_TIMESERIES
   581  	}
   582  
   583  	if tag := v.GetTag(); tag != expectedTag {
   584  		return fmt.Errorf("value type is not %s: %s", expectedTag, tag)
   585  	}
   586  	return protoutil.Unmarshal(v.dataBytes(), msg)
   587  }
   588  
   589  // GetTime decodes a time value from the bytes field of the receiver. If the
   590  // tag is not TIME an error will be returned.
   591  func (v Value) GetTime() (time.Time, error) {
   592  	if tag := v.GetTag(); tag != ValueType_TIME {
   593  		return time.Time{}, fmt.Errorf("value type is not %s: %s", ValueType_TIME, tag)
   594  	}
   595  	_, t, err := encoding.DecodeTimeAscending(v.dataBytes())
   596  	return t, err
   597  }
   598  
   599  // GetTimeTZ decodes a time value from the bytes field of the receiver. If the
   600  // tag is not TIMETZ an error will be returned.
   601  func (v Value) GetTimeTZ() (timetz.TimeTZ, error) {
   602  	if tag := v.GetTag(); tag != ValueType_TIMETZ {
   603  		return timetz.TimeTZ{}, fmt.Errorf("value type is not %s: %s", ValueType_TIMETZ, tag)
   604  	}
   605  	_, t, err := encoding.DecodeTimeTZAscending(v.dataBytes())
   606  	return t, err
   607  }
   608  
   609  // GetDuration decodes a duration value from the bytes field of the receiver. If
   610  // the tag is not DURATION an error will be returned.
   611  func (v Value) GetDuration() (duration.Duration, error) {
   612  	if tag := v.GetTag(); tag != ValueType_DURATION {
   613  		return duration.Duration{}, fmt.Errorf("value type is not %s: %s", ValueType_DURATION, tag)
   614  	}
   615  	_, t, err := encoding.DecodeDurationAscending(v.dataBytes())
   616  	return t, err
   617  }
   618  
   619  // GetBitArray decodes a bit array value from the bytes field of the receiver. If
   620  // the tag is not BITARRAY an error will be returned.
   621  func (v Value) GetBitArray() (bitarray.BitArray, error) {
   622  	if tag := v.GetTag(); tag != ValueType_BITARRAY {
   623  		return bitarray.BitArray{}, fmt.Errorf("value type is not %s: %s", ValueType_BITARRAY, tag)
   624  	}
   625  	_, t, err := encoding.DecodeUntaggedBitArrayValue(v.dataBytes())
   626  	return t, err
   627  }
   628  
   629  // GetDecimal decodes a decimal value from the bytes of the receiver. If the
   630  // tag is not DECIMAL an error will be returned.
   631  func (v Value) GetDecimal() (apd.Decimal, error) {
   632  	if tag := v.GetTag(); tag != ValueType_DECIMAL {
   633  		return apd.Decimal{}, fmt.Errorf("value type is not %s: %s", ValueType_DECIMAL, tag)
   634  	}
   635  	return encoding.DecodeNonsortingDecimal(v.dataBytes(), nil)
   636  }
   637  
   638  // GetDecimalInto decodes a decimal value from the bytes of the receiver,
   639  // writing it directly into the provided non-null apd.Decimal. If the
   640  // tag is not DECIMAL an error will be returned.
   641  func (v Value) GetDecimalInto(d *apd.Decimal) error {
   642  	if tag := v.GetTag(); tag != ValueType_DECIMAL {
   643  		return fmt.Errorf("value type is not %s: %s", ValueType_DECIMAL, tag)
   644  	}
   645  	return encoding.DecodeIntoNonsortingDecimal(d, v.dataBytes(), nil)
   646  }
   647  
   648  // GetTimeseries decodes an InternalTimeSeriesData value from the bytes
   649  // field of the receiver. An error will be returned if the tag is not
   650  // TIMESERIES or if decoding fails.
   651  func (v Value) GetTimeseries() (InternalTimeSeriesData, error) {
   652  	ts := InternalTimeSeriesData{}
   653  	// GetProto mutates its argument. `return ts, v.GetProto(&ts)`
   654  	// happens to work in gc, but does not work in gccgo.
   655  	//
   656  	// See https://github.com/golang/go/issues/23188.
   657  	err := v.GetProto(&ts)
   658  	return ts, err
   659  }
   660  
   661  // GetTuple returns the tuple bytes of the receiver. If the tag is not TUPLE an
   662  // error will be returned.
   663  func (v Value) GetTuple() ([]byte, error) {
   664  	if tag := v.GetTag(); tag != ValueType_TUPLE {
   665  		return nil, fmt.Errorf("value type is not %s: %s", ValueType_TUPLE, tag)
   666  	}
   667  	return v.dataBytes(), nil
   668  }
   669  
// crc32Pool recycles CRC-32 (IEEE) hashers so that Value.computeChecksum does
// not have to allocate one per call. Hashers are reset by computeChecksum
// before they are returned to the pool.
var crc32Pool = sync.Pool{
	New: func() interface{} {
		return crc32.NewIEEE()
	},
}
   675  
   676  func computeChecksum(key, rawBytes []byte, crc hash.Hash32) uint32 {
   677  	if len(rawBytes) < headerSize {
   678  		return 0
   679  	}
   680  	if _, err := crc.Write(key); err != nil {
   681  		panic(err)
   682  	}
   683  	if _, err := crc.Write(rawBytes[checksumSize:]); err != nil {
   684  		panic(err)
   685  	}
   686  	sum := crc.Sum32()
   687  	crc.Reset()
   688  	// We reserved the value 0 (checksumUninitialized) to indicate that a checksum
   689  	// has not been initialized. This reservation is accomplished by folding a
   690  	// computed checksum of 0 to the value 1.
   691  	if sum == checksumUninitialized {
   692  		return 1
   693  	}
   694  	return sum
   695  }
   696  
   697  // computeChecksum computes a checksum based on the provided key and
   698  // the contents of the value.
   699  func (v Value) computeChecksum(key []byte) uint32 {
   700  	crc := crc32Pool.Get().(hash.Hash32)
   701  	sum := computeChecksum(key, v.RawBytes, crc)
   702  	crc32Pool.Put(crc)
   703  	return sum
   704  }
   705  
   706  // PrettyPrint returns the value in a human readable format.
   707  // e.g. `Put /Table/51/1/1/0 -> /TUPLE/2:2:Int/7/1:3:Float/6.28`
   708  // In `1:3:Float/6.28`, the `1` is the column id diff as stored, `3` is the
   709  // computed (i.e. not stored) actual column id, `Float` is the type, and `6.28`
   710  // is the encoded value.
   711  func (v Value) PrettyPrint() string {
   712  	if len(v.RawBytes) == 0 {
   713  		return "/<empty>"
   714  	}
   715  	var buf bytes.Buffer
   716  	t := v.GetTag()
   717  	buf.WriteRune('/')
   718  	buf.WriteString(t.String())
   719  	buf.WriteRune('/')
   720  
   721  	var err error
   722  	switch t {
   723  	case ValueType_TUPLE:
   724  		b := v.dataBytes()
   725  		var colID uint32
   726  		for i := 0; len(b) > 0; i++ {
   727  			if i != 0 {
   728  				buf.WriteRune('/')
   729  			}
   730  			_, _, colIDDiff, typ, err := encoding.DecodeValueTag(b)
   731  			if err != nil {
   732  				break
   733  			}
   734  			colID += colIDDiff
   735  			var s string
   736  			b, s, err = encoding.PrettyPrintValueEncoded(b)
   737  			if err != nil {
   738  				break
   739  			}
   740  			fmt.Fprintf(&buf, "%d:%d:%s/%s", colIDDiff, colID, typ, s)
   741  		}
   742  	case ValueType_INT:
   743  		var i int64
   744  		i, err = v.GetInt()
   745  		buf.WriteString(strconv.FormatInt(i, 10))
   746  	case ValueType_FLOAT:
   747  		var f float64
   748  		f, err = v.GetFloat()
   749  		buf.WriteString(strconv.FormatFloat(f, 'g', -1, 64))
   750  	case ValueType_BYTES:
   751  		var data []byte
   752  		data, err = v.GetBytes()
   753  		if encoding.PrintableBytes(data) {
   754  			buf.WriteString(string(data))
   755  		} else {
   756  			buf.WriteString("0x")
   757  			buf.WriteString(hex.EncodeToString(data))
   758  		}
   759  	case ValueType_BITARRAY:
   760  		var data bitarray.BitArray
   761  		data, err = v.GetBitArray()
   762  		buf.WriteByte('B')
   763  		data.Format(&buf)
   764  	case ValueType_TIME:
   765  		var t time.Time
   766  		t, err = v.GetTime()
   767  		buf.WriteString(t.UTC().Format(time.RFC3339Nano))
   768  	case ValueType_DECIMAL:
   769  		var d apd.Decimal
   770  		d, err = v.GetDecimal()
   771  		buf.WriteString(d.String())
   772  	case ValueType_DURATION:
   773  		var d duration.Duration
   774  		d, err = v.GetDuration()
   775  		buf.WriteString(d.StringNanos())
   776  	default:
   777  		err = errors.Errorf("unknown tag: %s", t)
   778  	}
   779  	if err != nil {
   780  		// Ignore the contents of buf and return directly.
   781  		return fmt.Sprintf("/<err: %s>", err)
   782  	}
   783  	return buf.String()
   784  }
   785  
   786  // IsFinalized determines whether the transaction status is in a finalized
   787  // state. A finalized state is terminal, meaning that once a transaction
   788  // enters one of these states, it will never leave it.
   789  func (ts TransactionStatus) IsFinalized() bool {
   790  	return ts == COMMITTED || ts == ABORTED
   791  }
   792  
   793  // IsCommittedOrStaging determines if the transaction is morally committed (i.e.
   794  // in the COMMITTED or STAGING state).
   795  func (ts TransactionStatus) IsCommittedOrStaging() bool {
   796  	return ts == COMMITTED || ts == STAGING
   797  }
   798  
// Compile-time assertion that Transaction implements errors.SafeMessager.
var _ errors.SafeMessager = Transaction{}
   800  
   801  // MakeTransaction creates a new transaction. The transaction key is
   802  // composed using the specified baseKey (for locality with data
   803  // affected by the transaction) and a random ID to guarantee
   804  // uniqueness. The specified user-level priority is combined with a
   805  // randomly chosen value to yield a final priority, used to settle
   806  // write conflicts in a way that avoids starvation of long-running
   807  // transactions (see Replica.PushTxn).
   808  //
   809  // baseKey can be nil, in which case it will be set when sending the first
   810  // write.
   811  func MakeTransaction(
   812  	name string, baseKey Key, userPriority UserPriority, now hlc.Timestamp, maxOffsetNs int64,
   813  ) Transaction {
   814  	u := uuid.FastMakeV4()
   815  	maxTS := now.Add(maxOffsetNs, 0)
   816  
   817  	return Transaction{
   818  		TxnMeta: enginepb.TxnMeta{
   819  			Key:            baseKey,
   820  			ID:             u,
   821  			WriteTimestamp: now,
   822  			MinTimestamp:   now,
   823  			Priority:       MakePriority(userPriority),
   824  			Sequence:       0, // 1-indexed, incremented before each Request
   825  		},
   826  		Name:                    name,
   827  		LastHeartbeat:           now,
   828  		ReadTimestamp:           now,
   829  		MaxTimestamp:            maxTS,
   830  		DeprecatedOrigTimestamp: now, // For compatibility with 19.2.
   831  	}
   832  }
   833  
   834  // LastActive returns the last timestamp at which client activity definitely
   835  // occurred, i.e. the maximum of ReadTimestamp and LastHeartbeat.
   836  func (t Transaction) LastActive() hlc.Timestamp {
   837  	ts := t.LastHeartbeat
   838  	ts.Forward(t.ReadTimestamp)
   839  
   840  	// For compatibility with 19.2, handle the case where ReadTimestamp isn't
   841  	// set.
   842  	ts.Forward(t.DeprecatedOrigTimestamp)
   843  	return ts
   844  }
   845  
   846  // Clone creates a copy of the given transaction. The copy is shallow because
   847  // none of the references held by a transaction allow interior mutability.
   848  func (t Transaction) Clone() *Transaction {
   849  	return &t
   850  }
   851  
   852  // AssertInitialized crashes if the transaction is not initialized.
   853  func (t *Transaction) AssertInitialized(ctx context.Context) {
   854  	if t.ID == (uuid.UUID{}) || t.WriteTimestamp == (hlc.Timestamp{}) {
   855  		log.Fatalf(ctx, "uninitialized txn: %s", *t)
   856  	}
   857  }
   858  
   859  // MakePriority generates a random priority value, biased by the specified
   860  // userPriority. If userPriority=100, the random priority will be 100x more
   861  // likely to be greater than if userPriority=1. If userPriority = 0.1, the
   862  // random priority will be 1/10th as likely to be greater than if
   863  // userPriority=NormalUserPriority ( = 1). Balance is achieved when
   864  // userPriority=NormalUserPriority, in which case the priority chosen is
   865  // unbiased.
   866  //
   867  // If userPriority is less than or equal to MinUserPriority, returns
   868  // MinTxnPriority; if greater than or equal to MaxUserPriority, returns
   869  // MaxTxnPriority. If userPriority is 0, returns NormalUserPriority.
   870  func MakePriority(userPriority UserPriority) enginepb.TxnPriority {
   871  	// A currently undocumented feature allows an explicit priority to
   872  	// be set by specifying priority < 1. The explicit priority is
   873  	// simply -userPriority in this case. This is hacky, but currently
   874  	// used for unittesting. Perhaps this should be documented and allowed.
   875  	if userPriority < 0 {
   876  		if -userPriority > UserPriority(math.MaxInt32) {
   877  			panic(fmt.Sprintf("cannot set explicit priority to a value less than -%d", math.MaxInt32))
   878  		}
   879  		return enginepb.TxnPriority(-userPriority)
   880  	} else if userPriority == 0 {
   881  		userPriority = NormalUserPriority
   882  	} else if userPriority >= MaxUserPriority {
   883  		return enginepb.MaxTxnPriority
   884  	} else if userPriority <= MinUserPriority {
   885  		return enginepb.MinTxnPriority
   886  	}
   887  
   888  	// We generate random values which are biased according to priorities. If v1 is a value
   889  	// generated for priority p1 and v2 is a value of priority v2, we want the ratio of wins vs
   890  	// losses to be the same with the ratio of priorities:
   891  	//
   892  	//    P[ v1 > v2 ]     p1                                           p1
   893  	//    ------------  =  --     or, equivalently:    P[ v1 > v2 ] = -------
   894  	//    P[ v2 < v1 ]     p2                                         p1 + p2
   895  	//
   896  	//
   897  	// For example, priority 10 wins 10 out of 11 times over priority 1, and it wins 100 out of 101
   898  	// times over priority 0.1.
   899  	//
   900  	//
   901  	// We use the exponential distribution. This distribution has the probability density function
   902  	//   PDF_lambda(x) = lambda * exp(-lambda * x)
   903  	// and the cumulative distribution function (i.e. probability that a random value is smaller
   904  	// than x):
   905  	//   CDF_lambda(x) = Integral_0^x PDF_lambda(x) dx
   906  	//                 = 1 - exp(-lambda * x)
   907  	//
   908  	// Let's assume we generate x from the exponential distribution with the lambda rate set to
   909  	// l1 and we generate y from the distribution with the rate set to l2. The probability that x
   910  	// wins is:
   911  	//    P[ x > y ] = Integral_0^inf Integral_0^x PDF_l1(x) PDF_l2(y) dy dx
   912  	//               = Integral_0^inf PDF_l1(x) Integral_0^x PDF_l2(y) dy dx
   913  	//               = Integral_0^inf PDF_l1(x) CDF_l2(x) dx
   914  	//               = Integral_0^inf PDF_l1(x) (1 - exp(-l2 * x)) dx
   915  	//               = 1 - Integral_0^inf l1 * exp(-(l1+l2) * x) dx
   916  	//               = 1 - l1 / (l1 + l2) * Integral_0^inf PDF_(l1+l2)(x) dx
   917  	//               = 1 - l1 / (l1 + l2)
   918  	//               = l2 / (l1 + l2)
   919  	//
   920  	// We want this probability to be p1 / (p1 + p2) which we can get by setting
   921  	//    l1 = 1 / p1
   922  	//    l2 = 1 / p2
   923  	// It's easy to verify that (1/p2) / (1/p1 + 1/p2) = p1 / (p2 + p1).
   924  	//
   925  	// We can generate an exponentially distributed value using (rand.ExpFloat64() / lambda).
   926  	// In our case this works out to simply rand.ExpFloat64() * userPriority.
   927  	val := rand.ExpFloat64() * float64(userPriority)
   928  
   929  	// To convert to an integer, we scale things to accommodate a few (5) standard deviations for
   930  	// the maximum priority. The choice of the value is a trade-off between loss of resolution for
   931  	// low priorities and overflow (capping the value to MaxInt32) for high priorities.
   932  	//
   933  	// For userPriority=MaxUserPriority, the probability of overflow is 0.7%.
   934  	// For userPriority=(MaxUserPriority/2), the probability of overflow is 0.005%.
   935  	val = (val / (5 * float64(MaxUserPriority))) * math.MaxInt32
   936  	if val < float64(enginepb.MinTxnPriority+1) {
   937  		return enginepb.MinTxnPriority + 1
   938  	} else if val > float64(enginepb.MaxTxnPriority-1) {
   939  		return enginepb.MaxTxnPriority - 1
   940  	}
   941  	return enginepb.TxnPriority(val)
   942  }
   943  
   944  // Restart reconfigures a transaction for restart. The epoch is
   945  // incremented for an in-place restart. The timestamp of the
   946  // transaction on restart is set to the maximum of the transaction's
   947  // timestamp and the specified timestamp.
   948  func (t *Transaction) Restart(
   949  	userPriority UserPriority, upgradePriority enginepb.TxnPriority, timestamp hlc.Timestamp,
   950  ) {
   951  	t.BumpEpoch()
   952  	if t.WriteTimestamp.Less(timestamp) {
   953  		t.WriteTimestamp = timestamp
   954  	}
   955  	t.ReadTimestamp = t.WriteTimestamp
   956  	t.DeprecatedOrigTimestamp = t.WriteTimestamp // For 19.2 compatibility.
   957  	// Upgrade priority to the maximum of:
   958  	// - the current transaction priority
   959  	// - a random priority created from userPriority
   960  	// - the conflicting transaction's upgradePriority
   961  	t.UpgradePriority(MakePriority(userPriority))
   962  	t.UpgradePriority(upgradePriority)
   963  	// Reset all epoch-scoped state.
   964  	t.Sequence = 0
   965  	t.WriteTooOld = false
   966  	t.CommitTimestampFixed = false
   967  	t.LockSpans = nil
   968  	t.InFlightWrites = nil
   969  	t.IgnoredSeqNums = nil
   970  }
   971  
   972  // BumpEpoch increments the transaction's epoch, allowing for an in-place
   973  // restart. This invalidates all write intents previously written at lower
   974  // epochs.
   975  func (t *Transaction) BumpEpoch() {
   976  	t.Epoch++
   977  }
   978  
   979  // Update ratchets priority, timestamp and original timestamp values (among
   980  // others) for the transaction. If t.ID is empty, then the transaction is
   981  // copied from o.
   982  func (t *Transaction) Update(o *Transaction) {
   983  	if o == nil {
   984  		return
   985  	}
   986  	o.AssertInitialized(context.TODO())
   987  	if t.ID == (uuid.UUID{}) {
   988  		*t = *o
   989  		return
   990  	} else if t.ID != o.ID {
   991  		log.Fatalf(context.Background(), "updating txn %s with different txn %s", t.String(), o.String())
   992  		return
   993  	}
   994  	if len(t.Key) == 0 {
   995  		t.Key = o.Key
   996  	}
   997  
   998  	// Update epoch-scoped state, depending on the two transactions' epochs.
   999  	if t.Epoch < o.Epoch {
  1000  		// Replace all epoch-scoped state.
  1001  		t.Epoch = o.Epoch
  1002  		t.Status = o.Status
  1003  		t.WriteTooOld = o.WriteTooOld
  1004  		t.CommitTimestampFixed = o.CommitTimestampFixed
  1005  		t.Sequence = o.Sequence
  1006  		t.LockSpans = o.LockSpans
  1007  		t.InFlightWrites = o.InFlightWrites
  1008  		t.IgnoredSeqNums = o.IgnoredSeqNums
  1009  	} else if t.Epoch == o.Epoch {
  1010  		// Forward all epoch-scoped state.
  1011  		switch t.Status {
  1012  		case PENDING:
  1013  			t.Status = o.Status
  1014  		case STAGING:
  1015  			if o.Status != PENDING {
  1016  				t.Status = o.Status
  1017  			}
  1018  		case ABORTED:
  1019  			if o.Status == COMMITTED {
  1020  				log.Warningf(context.Background(), "updating ABORTED txn %s with COMMITTED txn %s", t.String(), o.String())
  1021  			}
  1022  		case COMMITTED:
  1023  			// Nothing to do.
  1024  		}
  1025  
  1026  		if t.ReadTimestamp.Equal(o.ReadTimestamp) {
  1027  			// If neither of the transactions has a bumped ReadTimestamp, then the
  1028  			// WriteTooOld flag is cumulative.
  1029  			t.WriteTooOld = t.WriteTooOld || o.WriteTooOld
  1030  			t.CommitTimestampFixed = t.CommitTimestampFixed || o.CommitTimestampFixed
  1031  		} else if t.ReadTimestamp.Less(o.ReadTimestamp) {
  1032  			// If `o` has a higher ReadTimestamp (i.e. it's the result of a refresh,
  1033  			// which refresh generally clears the WriteTooOld field), then it dictates
  1034  			// the WriteTooOld field. This relies on refreshes not being performed
  1035  			// concurrently with any requests whose response's WriteTooOld field
  1036  			// matters.
  1037  			t.WriteTooOld = o.WriteTooOld
  1038  			t.CommitTimestampFixed = o.CommitTimestampFixed
  1039  		}
  1040  		// If t has a higher ReadTimestamp, than it gets to dictate the
  1041  		// WriteTooOld field - so there's nothing to update.
  1042  
  1043  		if t.Sequence < o.Sequence {
  1044  			t.Sequence = o.Sequence
  1045  		}
  1046  		if len(o.LockSpans) > 0 {
  1047  			t.LockSpans = o.LockSpans
  1048  		}
  1049  		if len(o.InFlightWrites) > 0 {
  1050  			t.InFlightWrites = o.InFlightWrites
  1051  		}
  1052  		if len(o.IgnoredSeqNums) > 0 {
  1053  			t.IgnoredSeqNums = o.IgnoredSeqNums
  1054  		}
  1055  	} else /* t.Epoch > o.Epoch */ {
  1056  		// Ignore epoch-specific state from previous epoch. However, ensure that
  1057  		// the transaction status still makes sense.
  1058  		switch o.Status {
  1059  		case ABORTED:
  1060  			// Once aborted, always aborted. The transaction coordinator might
  1061  			// have incremented the txn's epoch without realizing that it was
  1062  			// aborted.
  1063  			t.Status = ABORTED
  1064  		case COMMITTED:
  1065  			log.Warningf(context.Background(), "updating txn %s with COMMITTED txn at earlier epoch %s", t.String(), o.String())
  1066  		}
  1067  	}
  1068  
  1069  	// Forward each of the transaction timestamps.
  1070  	t.WriteTimestamp.Forward(o.WriteTimestamp)
  1071  	t.LastHeartbeat.Forward(o.LastHeartbeat)
  1072  	t.DeprecatedOrigTimestamp.Forward(o.DeprecatedOrigTimestamp)
  1073  	t.MaxTimestamp.Forward(o.MaxTimestamp)
  1074  	t.ReadTimestamp.Forward(o.ReadTimestamp)
  1075  
  1076  	// On update, set lower bound timestamps to the minimum seen by either txn.
  1077  	// These shouldn't differ unless one of them is empty, but we're careful
  1078  	// anyway.
  1079  	if t.MinTimestamp == (hlc.Timestamp{}) {
  1080  		t.MinTimestamp = o.MinTimestamp
  1081  	} else if o.MinTimestamp != (hlc.Timestamp{}) {
  1082  		t.MinTimestamp.Backward(o.MinTimestamp)
  1083  	}
  1084  
  1085  	// Absorb the collected clock uncertainty information.
  1086  	for _, v := range o.ObservedTimestamps {
  1087  		t.UpdateObservedTimestamp(v.NodeID, v.Timestamp)
  1088  	}
  1089  
  1090  	// Ratchet the transaction priority.
  1091  	t.UpgradePriority(o.Priority)
  1092  }
  1093  
  1094  // UpgradePriority sets transaction priority to the maximum of current
  1095  // priority and the specified minPriority. The exception is if the
  1096  // current priority is set to the minimum, in which case the minimum
  1097  // is preserved.
  1098  func (t *Transaction) UpgradePriority(minPriority enginepb.TxnPriority) {
  1099  	if minPriority > t.Priority && t.Priority != enginepb.MinTxnPriority {
  1100  		t.Priority = minPriority
  1101  	}
  1102  }
  1103  
  1104  // IsLocking returns whether the transaction has begun acquiring locks.
  1105  // This method will never return false for a writing transaction.
  1106  func (t *Transaction) IsLocking() bool {
  1107  	return t.Key != nil
  1108  }
  1109  
  1110  // String formats transaction into human readable string.
  1111  //
  1112  // NOTE: When updating String(), you probably want to also update SafeMessage().
  1113  func (t Transaction) String() string {
  1114  	var buf strings.Builder
  1115  	if len(t.Name) > 0 {
  1116  		fmt.Fprintf(&buf, "%q ", t.Name)
  1117  	}
  1118  	fmt.Fprintf(&buf, "meta={%s} lock=%t stat=%s rts=%s wto=%t max=%s",
  1119  		t.TxnMeta, t.IsLocking(), t.Status, t.ReadTimestamp, t.WriteTooOld, t.MaxTimestamp)
  1120  	if ni := len(t.LockSpans); t.Status != PENDING && ni > 0 {
  1121  		fmt.Fprintf(&buf, " int=%d", ni)
  1122  	}
  1123  	if nw := len(t.InFlightWrites); t.Status != PENDING && nw > 0 {
  1124  		fmt.Fprintf(&buf, " ifw=%d", nw)
  1125  	}
  1126  	if ni := len(t.IgnoredSeqNums); ni > 0 {
  1127  		fmt.Fprintf(&buf, " isn=%d", ni)
  1128  	}
  1129  	return buf.String()
  1130  }
  1131  
  1132  // SafeMessage implements the SafeMessager interface.
  1133  //
  1134  // This method should be kept largely synchronized with String(), except that it
  1135  // can't include sensitive info (e.g. the transaction key).
  1136  func (t Transaction) SafeMessage() string {
  1137  	var buf strings.Builder
  1138  	if len(t.Name) > 0 {
  1139  		fmt.Fprintf(&buf, "%q ", t.Name)
  1140  	}
  1141  	fmt.Fprintf(&buf, "meta={%s} lock=%t stat=%s rts=%s wto=%t max=%s",
  1142  		t.TxnMeta.SafeMessage(), t.IsLocking(), t.Status, t.ReadTimestamp, t.WriteTooOld, t.MaxTimestamp)
  1143  	if ni := len(t.LockSpans); t.Status != PENDING && ni > 0 {
  1144  		fmt.Fprintf(&buf, " int=%d", ni)
  1145  	}
  1146  	if nw := len(t.InFlightWrites); t.Status != PENDING && nw > 0 {
  1147  		fmt.Fprintf(&buf, " ifw=%d", nw)
  1148  	}
  1149  	if ni := len(t.IgnoredSeqNums); ni > 0 {
  1150  		fmt.Fprintf(&buf, " isn=%d", ni)
  1151  	}
  1152  	return buf.String()
  1153  }
  1154  
  1155  // ResetObservedTimestamps clears out all timestamps recorded from individual
  1156  // nodes.
  1157  func (t *Transaction) ResetObservedTimestamps() {
  1158  	t.ObservedTimestamps = nil
  1159  }
  1160  
  1161  // UpdateObservedTimestamp stores a timestamp off a node's clock for future
  1162  // operations in the transaction. When multiple calls are made for a single
  1163  // nodeID, the lowest timestamp prevails.
  1164  func (t *Transaction) UpdateObservedTimestamp(nodeID NodeID, maxTS hlc.Timestamp) {
  1165  	// Fast path optimization for either no observed timestamps or
  1166  	// exactly one, for the same nodeID as we're updating.
  1167  	if l := len(t.ObservedTimestamps); l == 0 {
  1168  		t.ObservedTimestamps = []ObservedTimestamp{{NodeID: nodeID, Timestamp: maxTS}}
  1169  		return
  1170  	} else if l == 1 && t.ObservedTimestamps[0].NodeID == nodeID {
  1171  		if maxTS.Less(t.ObservedTimestamps[0].Timestamp) {
  1172  			t.ObservedTimestamps = []ObservedTimestamp{{NodeID: nodeID, Timestamp: maxTS}}
  1173  		}
  1174  		return
  1175  	}
  1176  	s := observedTimestampSlice(t.ObservedTimestamps)
  1177  	t.ObservedTimestamps = s.update(nodeID, maxTS)
  1178  }
  1179  
  1180  // GetObservedTimestamp returns the lowest HLC timestamp recorded from the
  1181  // given node's clock during the transaction. The returned boolean is false if
  1182  // no observation about the requested node was found. Otherwise, MaxTimestamp
  1183  // can be lowered to the returned timestamp when reading from nodeID.
  1184  func (t *Transaction) GetObservedTimestamp(nodeID NodeID) (hlc.Timestamp, bool) {
  1185  	s := observedTimestampSlice(t.ObservedTimestamps)
  1186  	return s.get(nodeID)
  1187  }
  1188  
  1189  // AddIgnoredSeqNumRange adds the given range to the given list of
  1190  // ignored seqnum ranges. Since none of the references held by a Transaction
  1191  // allow interior mutations, the existing list is copied instead of being
  1192  // mutated in place.
  1193  //
  1194  // The following invariants are assumed to hold and are preserved:
  1195  // - the list contains no overlapping ranges
  1196  // - the list contains no contiguous ranges
  1197  // - the list is sorted, with larger seqnums at the end
  1198  //
  1199  // Additionally, the caller must ensure:
  1200  //
  1201  // 1) if the new range overlaps with some range in the list, then it
  1202  //    also overlaps with every subsequent range in the list.
  1203  //
  1204  // 2) the new range's "end" seqnum is larger or equal to the "end"
  1205  //    seqnum of the last element in the list.
  1206  //
  1207  // For example:
  1208  //     current list [3 5] [10 20] [22 24]
  1209  //     new item:    [8 26]
  1210  //     final list:  [3 5] [8 26]
  1211  //
  1212  //     current list [3 5] [10 20] [22 24]
  1213  //     new item:    [28 32]
  1214  //     final list:  [3 5] [10 20] [22 24] [28 32]
  1215  //
  1216  // This corresponds to savepoints semantics:
  1217  //
  1218  // - Property 1 says that a rollback to an earlier savepoint
  1219  //   rolls back over all writes following that savepoint.
  1220  // - Property 2 comes from that the new range's 'end' seqnum is the
  1221  //   current write seqnum and thus larger than or equal to every
  1222  //   previously seen value.
  1223  func (t *Transaction) AddIgnoredSeqNumRange(newRange enginepb.IgnoredSeqNumRange) {
  1224  	// Truncate the list at the last element not included in the new range.
  1225  
  1226  	list := t.IgnoredSeqNums
  1227  	i := sort.Search(len(list), func(i int) bool {
  1228  		return list[i].End >= newRange.Start
  1229  	})
  1230  
  1231  	cpy := make([]enginepb.IgnoredSeqNumRange, i+1)
  1232  	copy(cpy[:i], list[:i])
  1233  	cpy[i] = newRange
  1234  	t.IgnoredSeqNums = cpy
  1235  }
  1236  
  1237  // AsRecord returns a TransactionRecord object containing only the subset of
  1238  // fields from the receiver that must be persisted in the transaction record.
  1239  func (t *Transaction) AsRecord() TransactionRecord {
  1240  	var tr TransactionRecord
  1241  	tr.TxnMeta = t.TxnMeta
  1242  	tr.Status = t.Status
  1243  	tr.LastHeartbeat = t.LastHeartbeat
  1244  	tr.LockSpans = t.LockSpans
  1245  	tr.InFlightWrites = t.InFlightWrites
  1246  	tr.IgnoredSeqNums = t.IgnoredSeqNums
  1247  	return tr
  1248  }
  1249  
  1250  // AsTransaction returns a Transaction object containing populated fields for
  1251  // state in the transaction record and empty fields for state omitted from the
  1252  // transaction record.
  1253  func (tr *TransactionRecord) AsTransaction() Transaction {
  1254  	var t Transaction
  1255  	t.TxnMeta = tr.TxnMeta
  1256  	t.Status = tr.Status
  1257  	t.LastHeartbeat = tr.LastHeartbeat
  1258  	t.LockSpans = tr.LockSpans
  1259  	t.InFlightWrites = tr.InFlightWrites
  1260  	t.IgnoredSeqNums = tr.IgnoredSeqNums
  1261  	return t
  1262  }
  1263  
  1264  // PrepareTransactionForRetry returns a new Transaction to be used for retrying
  1265  // the original Transaction. Depending on the error, this might return an
  1266  // already-existing Transaction with an incremented epoch, or a completely new
  1267  // Transaction.
  1268  //
  1269  // The caller should generally check that the error was
  1270  // meant for this Transaction before calling this.
  1271  //
  1272  // pri is the priority that should be used when giving the restarted transaction
  1273  // the chance to get a higher priority. Not used when the transaction is being
  1274  // aborted.
  1275  //
  1276  // In case retryErr tells us that a new Transaction needs to be created,
  1277  // isolation and name help initialize this new transaction.
  1278  func PrepareTransactionForRetry(
  1279  	ctx context.Context, pErr *Error, pri UserPriority, clock *hlc.Clock,
  1280  ) Transaction {
  1281  	if pErr.TransactionRestart == TransactionRestart_NONE {
  1282  		log.Fatalf(ctx, "invalid retryable err (%T): %s", pErr.GetDetail(), pErr)
  1283  	}
  1284  
  1285  	if pErr.GetTxn() == nil {
  1286  		log.Fatalf(ctx, "missing txn for retryable error: %s", pErr)
  1287  	}
  1288  
  1289  	txn := *pErr.GetTxn()
  1290  	aborted := false
  1291  	switch tErr := pErr.GetDetail().(type) {
  1292  	case *TransactionAbortedError:
  1293  		// The txn coming with a TransactionAbortedError is not supposed to be used
  1294  		// for the restart. Instead, a brand new transaction is created.
  1295  		aborted = true
  1296  		// TODO(andrei): Should we preserve the ObservedTimestamps across the
  1297  		// restart?
  1298  		errTxnPri := txn.Priority
  1299  		// Start the new transaction at the current time from the local clock.
  1300  		// The local hlc should have been advanced to at least the error's
  1301  		// timestamp already.
  1302  		now := clock.Now()
  1303  		txn = MakeTransaction(
  1304  			txn.Name,
  1305  			nil, // baseKey
  1306  			// We have errTxnPri, but this wants a UserPriority. So we're going to
  1307  			// overwrite the priority below.
  1308  			NormalUserPriority,
  1309  			now,
  1310  			clock.MaxOffset().Nanoseconds(),
  1311  		)
  1312  		// Use the priority communicated back by the server.
  1313  		txn.Priority = errTxnPri
  1314  	case *ReadWithinUncertaintyIntervalError:
  1315  		txn.WriteTimestamp.Forward(
  1316  			readWithinUncertaintyIntervalRetryTimestamp(ctx, &txn, tErr, pErr.OriginNode))
  1317  	case *TransactionPushError:
  1318  		// Increase timestamp if applicable, ensuring that we're just ahead of
  1319  		// the pushee.
  1320  		txn.WriteTimestamp.Forward(tErr.PusheeTxn.WriteTimestamp)
  1321  		txn.UpgradePriority(tErr.PusheeTxn.Priority - 1)
  1322  	case *TransactionRetryError:
  1323  		// Nothing to do. Transaction.Timestamp has already been forwarded to be
  1324  		// ahead of any timestamp cache entries or newer versions which caused
  1325  		// the restart.
  1326  	case *WriteTooOldError:
  1327  		// Increase the timestamp to the ts at which we've actually written.
  1328  		txn.WriteTimestamp.Forward(writeTooOldRetryTimestamp(&txn, tErr))
  1329  	default:
  1330  		log.Fatalf(ctx, "invalid retryable err (%T): %s", pErr.GetDetail(), pErr)
  1331  	}
  1332  	if !aborted {
  1333  		if txn.Status.IsFinalized() {
  1334  			log.Fatalf(ctx, "transaction unexpectedly finalized in (%T): %s", pErr.GetDetail(), pErr)
  1335  		}
  1336  		txn.Restart(pri, txn.Priority, txn.WriteTimestamp)
  1337  	}
  1338  	return txn
  1339  }
  1340  
  1341  // CanTransactionRetryAtRefreshedTimestamp returns whether the transaction
  1342  // specified in the supplied error can be retried at a refreshed timestamp to
  1343  // avoid a client-side transaction restart. If true, returns a cloned, updated
  1344  // Transaction object with the provisional commit timestamp and refreshed
  1345  // timestamp set appropriately.
  1346  func CanTransactionRetryAtRefreshedTimestamp(
  1347  	ctx context.Context, pErr *Error,
  1348  ) (bool, *Transaction) {
  1349  	txn := pErr.GetTxn()
  1350  	if txn == nil || txn.CommitTimestampFixed {
  1351  		return false, nil
  1352  	}
  1353  	timestamp := txn.WriteTimestamp
  1354  	switch err := pErr.GetDetail().(type) {
  1355  	case *TransactionRetryError:
  1356  		if err.Reason != RETRY_SERIALIZABLE && err.Reason != RETRY_WRITE_TOO_OLD {
  1357  			return false, nil
  1358  		}
  1359  	case *WriteTooOldError:
  1360  		// TODO(andrei): Chances of success for on write-too-old conditions might be
  1361  		// usually small: if our txn previously read the key that generated this
  1362  		// error, obviously the refresh will fail. It might be worth trying to
  1363  		// detect these cases and save the futile attempt; we'd need to have access
  1364  		// to the key that generated the error.
  1365  		timestamp.Forward(writeTooOldRetryTimestamp(txn, err))
  1366  	case *ReadWithinUncertaintyIntervalError:
  1367  		timestamp.Forward(
  1368  			readWithinUncertaintyIntervalRetryTimestamp(ctx, txn, err, pErr.OriginNode))
  1369  	default:
  1370  		return false, nil
  1371  	}
  1372  
  1373  	newTxn := txn.Clone()
  1374  	newTxn.WriteTimestamp.Forward(timestamp)
  1375  	newTxn.ReadTimestamp.Forward(newTxn.WriteTimestamp)
  1376  	newTxn.WriteTooOld = false
  1377  
  1378  	return true, newTxn
  1379  }
  1380  
  1381  func readWithinUncertaintyIntervalRetryTimestamp(
  1382  	ctx context.Context, txn *Transaction, err *ReadWithinUncertaintyIntervalError, origin NodeID,
  1383  ) hlc.Timestamp {
  1384  	// If the reader encountered a newer write within the uncertainty
  1385  	// interval, we advance the txn's timestamp just past the last observed
  1386  	// timestamp from the node.
  1387  	ts, ok := txn.GetObservedTimestamp(origin)
  1388  	if !ok {
  1389  		log.Fatalf(ctx,
  1390  			"missing observed timestamp for node %d found on uncertainty restart. "+
  1391  				"err: %s. txn: %s. Observed timestamps: %v",
  1392  			origin, err, txn, txn.ObservedTimestamps)
  1393  	}
  1394  	// Also forward by the existing timestamp.
  1395  	ts.Forward(err.ExistingTimestamp.Next())
  1396  	return ts
  1397  }
  1398  
  1399  func writeTooOldRetryTimestamp(txn *Transaction, err *WriteTooOldError) hlc.Timestamp {
  1400  	return err.ActualTimestamp
  1401  }
  1402  
  1403  // Replicas returns all of the replicas present in the descriptor after this
  1404  // trigger applies.
  1405  func (crt ChangeReplicasTrigger) Replicas() []ReplicaDescriptor {
  1406  	if crt.Desc != nil {
  1407  		return crt.Desc.Replicas().All()
  1408  	}
  1409  	return crt.DeprecatedUpdatedReplicas
  1410  }
  1411  
  1412  // NextReplicaID returns the next replica id to use after this trigger applies.
  1413  func (crt ChangeReplicasTrigger) NextReplicaID() ReplicaID {
  1414  	if crt.Desc != nil {
  1415  		return crt.Desc.NextReplicaID
  1416  	}
  1417  	return crt.DeprecatedNextReplicaID
  1418  }
  1419  
  1420  // ConfChange returns the configuration change described by the trigger.
  1421  func (crt ChangeReplicasTrigger) ConfChange(encodedCtx []byte) (raftpb.ConfChangeI, error) {
  1422  	return confChangeImpl(crt, encodedCtx)
  1423  }
  1424  
  1425  func (crt ChangeReplicasTrigger) alwaysV2() bool {
  1426  	// NB: we can return true in 20.1, but we don't win anything unless
  1427  	// we are actively trying to migrate out of V1 membership changes, which
  1428  	// could modestly simplify small areas of our codebase.
  1429  	return false
  1430  }
  1431  
  1432  // confChangeImpl is the implementation of (ChangeReplicasTrigger).ConfChange
  1433  // narrowed down to the inputs it actually needs for better testability.
  1434  func confChangeImpl(
  1435  	crt interface {
  1436  		Added() []ReplicaDescriptor
  1437  		Removed() []ReplicaDescriptor
  1438  		Replicas() []ReplicaDescriptor
  1439  		alwaysV2() bool
  1440  	},
  1441  	encodedCtx []byte,
  1442  ) (raftpb.ConfChangeI, error) {
  1443  	added, removed, replicas := crt.Added(), crt.Removed(), crt.Replicas()
  1444  
  1445  	var sl []raftpb.ConfChangeSingle
  1446  
  1447  	checkExists := func(in ReplicaDescriptor) error {
  1448  		for _, rDesc := range replicas {
  1449  			if rDesc.ReplicaID == in.ReplicaID {
  1450  				if a, b := in.GetType(), rDesc.GetType(); a != b {
  1451  					return errors.Errorf("have %s, but descriptor has %s", in, rDesc)
  1452  				}
  1453  				return nil
  1454  			}
  1455  		}
  1456  		return errors.Errorf("%s missing from descriptors %v", in, replicas)
  1457  	}
  1458  	checkNotExists := func(in ReplicaDescriptor) error {
  1459  		for _, rDesc := range replicas {
  1460  			if rDesc.ReplicaID == in.ReplicaID {
  1461  				return errors.Errorf("%s must no longer be present in descriptor", in)
  1462  			}
  1463  		}
  1464  		return nil
  1465  	}
  1466  
  1467  	for _, rDesc := range removed {
  1468  		sl = append(sl, raftpb.ConfChangeSingle{
  1469  			Type:   raftpb.ConfChangeRemoveNode,
  1470  			NodeID: uint64(rDesc.ReplicaID),
  1471  		})
  1472  
  1473  		switch rDesc.GetType() {
  1474  		case VOTER_OUTGOING:
  1475  			// If a voter is removed through joint consensus, it will
  1476  			// be turned into an outgoing voter first.
  1477  			if err := checkExists(rDesc); err != nil {
  1478  				return nil, err
  1479  			}
  1480  		case VOTER_DEMOTING:
  1481  			// If a voter is demoted through joint consensus, it will
  1482  			// be turned into a demoting voter first.
  1483  			if err := checkExists(rDesc); err != nil {
  1484  				return nil, err
  1485  			}
  1486  			// It's being re-added as a learner, not only removed.
  1487  			sl = append(sl, raftpb.ConfChangeSingle{
  1488  				Type:   raftpb.ConfChangeAddLearnerNode,
  1489  				NodeID: uint64(rDesc.ReplicaID),
  1490  			})
  1491  		case LEARNER:
  1492  			// A learner could in theory show up in the descriptor if the
  1493  			// removal was really a demotion and no joint consensus is used.
  1494  			// But etcd/raft currently forces us to go through joint consensus
  1495  			// when demoting, so demotions will always have a VOTER_DEMOTING
  1496  			// instead. We must be straight-up removing a voter or learner, so
  1497  			// the target should be gone from the descriptor at this point.
  1498  			if err := checkNotExists(rDesc); err != nil {
  1499  				return nil, err
  1500  			}
  1501  		case VOTER_FULL:
  1502  			// A voter can't be in the descriptor if it's being removed.
  1503  			if err := checkNotExists(rDesc); err != nil {
  1504  				return nil, err
  1505  			}
  1506  		default:
  1507  			return nil, errors.Errorf("can't remove replica in state %v", rDesc.GetType())
  1508  		}
  1509  	}
  1510  
  1511  	for _, rDesc := range added {
  1512  		// The incoming descriptor must also be present in the set of all
  1513  		// replicas, which is ultimately the authoritative one because that's
  1514  		// what's written to the KV store.
  1515  		if err := checkExists(rDesc); err != nil {
  1516  			return nil, err
  1517  		}
  1518  
  1519  		var changeType raftpb.ConfChangeType
  1520  		switch rDesc.GetType() {
  1521  		case VOTER_FULL:
  1522  			// We're adding a new voter.
  1523  			changeType = raftpb.ConfChangeAddNode
  1524  		case VOTER_INCOMING:
  1525  			// We're adding a voter, but will transition into a joint config
  1526  			// first.
  1527  			changeType = raftpb.ConfChangeAddNode
  1528  		case LEARNER:
  1529  			// We're adding a learner.
  1530  			// Note that we're guaranteed by virtue of the upstream
  1531  			// ChangeReplicas txn that this learner is not currently a voter.
  1532  			// Demotions (i.e. transitioning from voter to learner) are not
  1533  			// represented in `added`; they're handled in `removed` above.
  1534  			changeType = raftpb.ConfChangeAddLearnerNode
  1535  		default:
  1536  			// A voter that is demoting was just removed and re-added in the
  1537  			// `removals` handler. We should not see it again here.
  1538  			// A voter that's outgoing similarly has no reason to show up here.
  1539  			return nil, errors.Errorf("can't add replica in state %v", rDesc.GetType())
  1540  		}
  1541  		sl = append(sl, raftpb.ConfChangeSingle{
  1542  			Type:   changeType,
  1543  			NodeID: uint64(rDesc.ReplicaID),
  1544  		})
  1545  	}
  1546  
  1547  	// Check whether we're entering a joint state. This is the case precisely when
  1548  	// the resulting descriptors tells us that this is the case. Note that we've
  1549  	// made sure above that all of the additions/removals are in tune with that
  1550  	// descriptor already.
  1551  	var enteringJoint bool
  1552  	for _, rDesc := range replicas {
  1553  		switch rDesc.GetType() {
  1554  		case VOTER_INCOMING, VOTER_OUTGOING, VOTER_DEMOTING:
  1555  			enteringJoint = true
  1556  		default:
  1557  		}
  1558  	}
  1559  	wantLeaveJoint := len(added)+len(removed) == 0
  1560  	if !enteringJoint {
  1561  		if len(added)+len(removed) > 1 {
  1562  			return nil, errors.Errorf("change requires joint consensus")
  1563  		}
  1564  	} else if wantLeaveJoint {
  1565  		return nil, errors.Errorf("descriptor enters joint state, but trigger is requesting to leave one")
  1566  	}
  1567  
  1568  	var cc raftpb.ConfChangeI
  1569  
  1570  	if enteringJoint || crt.alwaysV2() {
  1571  		// V2 membership changes, which allow atomic replication changes. We
  1572  		// track the joint state in the range descriptor and thus we need to be
  1573  		// in charge of when to leave the joint state.
  1574  		transition := raftpb.ConfChangeTransitionJointExplicit
  1575  		if !enteringJoint {
  1576  			// If we're using V2 just to avoid V1 (and not because we actually
  1577  			// have a change that requires V2), then use an auto transition
  1578  			// which skips the joint state. This is necessary: our descriptor
  1579  			// says we're not supposed to go through one.
  1580  			transition = raftpb.ConfChangeTransitionAuto
  1581  		}
  1582  		cc = raftpb.ConfChangeV2{
  1583  			Transition: transition,
  1584  			Changes:    sl,
  1585  			Context:    encodedCtx,
  1586  		}
  1587  	} else if wantLeaveJoint {
  1588  		// Transitioning out of a joint config.
  1589  		cc = raftpb.ConfChangeV2{
  1590  			Context: encodedCtx,
  1591  		}
  1592  	} else {
  1593  		// Legacy path with exactly one change.
  1594  		cc = raftpb.ConfChange{
  1595  			Type:    sl[0].Type,
  1596  			NodeID:  sl[0].NodeID,
  1597  			Context: encodedCtx,
  1598  		}
  1599  	}
  1600  	return cc, nil
  1601  }
  1602  
  1603  var _ fmt.Stringer = &ChangeReplicasTrigger{}
  1604  
  1605  func (crt ChangeReplicasTrigger) String() string {
  1606  	var nextReplicaID ReplicaID
  1607  	var afterReplicas []ReplicaDescriptor
  1608  	added, removed := crt.Added(), crt.Removed()
  1609  	if crt.Desc != nil {
  1610  		nextReplicaID = crt.Desc.NextReplicaID
  1611  		// NB: we don't want to mutate InternalReplicas, so we don't call
  1612  		// .Replicas()
  1613  		//
  1614  		// TODO(tbg): revisit after #39489 is merged.
  1615  		afterReplicas = crt.Desc.InternalReplicas
  1616  	} else {
  1617  		nextReplicaID = crt.DeprecatedNextReplicaID
  1618  		afterReplicas = crt.DeprecatedUpdatedReplicas
  1619  	}
  1620  	var chgS strings.Builder
  1621  	cc, err := crt.ConfChange(nil)
  1622  	if err != nil {
  1623  		fmt.Fprintf(&chgS, "<malformed ChangeReplicasTrigger: %s>", err)
  1624  	} else {
  1625  		ccv2 := cc.AsV2()
  1626  		if ccv2.LeaveJoint() {
  1627  			// NB: this isn't missing a trailing space.
  1628  			//
  1629  			// TODO(tbg): could list the replicas that will actually leave the
  1630  			// voter set.
  1631  			fmt.Fprintf(&chgS, "LEAVE_JOINT")
  1632  		} else if _, ok := ccv2.EnterJoint(); ok {
  1633  			fmt.Fprintf(&chgS, "ENTER_JOINT(%s) ", raftpb.ConfChangesToString(ccv2.Changes))
  1634  		} else {
  1635  			fmt.Fprintf(&chgS, "SIMPLE(%s) ", raftpb.ConfChangesToString(ccv2.Changes))
  1636  		}
  1637  	}
  1638  	if len(added) > 0 {
  1639  		fmt.Fprintf(&chgS, "%s%s", ADD_REPLICA, added)
  1640  	}
  1641  	if len(removed) > 0 {
  1642  		if len(added) > 0 {
  1643  			chgS.WriteString(", ")
  1644  		}
  1645  		fmt.Fprintf(&chgS, "%s%s", REMOVE_REPLICA, removed)
  1646  	}
  1647  	fmt.Fprintf(&chgS, ": after=%s next=%d", afterReplicas, nextReplicaID)
  1648  	return chgS.String()
  1649  }
  1650  
  1651  func (crt ChangeReplicasTrigger) legacy() (ReplicaDescriptor, bool) {
  1652  	if len(crt.InternalAddedReplicas)+len(crt.InternalRemovedReplicas) == 0 && crt.DeprecatedReplica.ReplicaID != 0 {
  1653  		return crt.DeprecatedReplica, true
  1654  	}
  1655  	return ReplicaDescriptor{}, false
  1656  }
  1657  
  1658  // Added returns the replicas added by this change (if there are any).
  1659  func (crt ChangeReplicasTrigger) Added() []ReplicaDescriptor {
  1660  	if rDesc, ok := crt.legacy(); ok && crt.DeprecatedChangeType == ADD_REPLICA {
  1661  		return []ReplicaDescriptor{rDesc}
  1662  	}
  1663  	return crt.InternalAddedReplicas
  1664  }
  1665  
  1666  // Removed returns the replicas whose removal is initiated by this change (if there are any).
  1667  // Note that in an atomic replication change, Removed() contains the replicas when they are
  1668  // transitioning to VOTER_{OUTGOING,DEMOTING} (from VOTER_FULL). The subsequent trigger
  1669  // leaving the joint configuration has an empty Removed().
  1670  func (crt ChangeReplicasTrigger) Removed() []ReplicaDescriptor {
  1671  	if rDesc, ok := crt.legacy(); ok && crt.DeprecatedChangeType == REMOVE_REPLICA {
  1672  		return []ReplicaDescriptor{rDesc}
  1673  	}
  1674  	return crt.InternalRemovedReplicas
  1675  }
  1676  
  1677  // LeaseSequence is a custom type for a lease sequence number.
  1678  type LeaseSequence int64
  1679  
  1680  // String implements the fmt.Stringer interface.
  1681  func (s LeaseSequence) String() string {
  1682  	return strconv.FormatInt(int64(s), 10)
  1683  }
  1684  
  1685  var _ fmt.Stringer = &Lease{}
  1686  
  1687  func (l Lease) String() string {
  1688  	var proposedSuffix string
  1689  	if l.ProposedTS != nil {
  1690  		proposedSuffix = fmt.Sprintf(" pro=%s", l.ProposedTS)
  1691  	}
  1692  	if l.Type() == LeaseExpiration {
  1693  		return fmt.Sprintf("repl=%s seq=%s start=%s exp=%s%s", l.Replica, l.Sequence, l.Start, l.Expiration, proposedSuffix)
  1694  	}
  1695  	return fmt.Sprintf("repl=%s seq=%s start=%s epo=%d%s", l.Replica, l.Sequence, l.Start, l.Epoch, proposedSuffix)
  1696  }
  1697  
  1698  // BootstrapLease returns the lease to persist for the range of a freshly bootstrapped store. The
  1699  // returned lease is morally "empty" but has a few fields set to non-nil zero values because some
  1700  // used to be non-nullable and we now fuzz their nullability in tests. As a consequence, it's better
  1701  // to always use zero fields here so that the initial stats are constant.
  1702  func BootstrapLease() Lease {
  1703  	return Lease{
  1704  		Expiration:            &hlc.Timestamp{},
  1705  		DeprecatedStartStasis: &hlc.Timestamp{},
  1706  	}
  1707  }
  1708  
  1709  // OwnedBy returns whether the given store is the lease owner.
  1710  func (l Lease) OwnedBy(storeID StoreID) bool {
  1711  	return l.Replica.StoreID == storeID
  1712  }
  1713  
  1714  // LeaseType describes the type of lease.
  1715  type LeaseType int
  1716  
  1717  const (
  1718  	// LeaseNone specifies no lease, to be used as a default value.
  1719  	LeaseNone LeaseType = iota
  1720  	// LeaseExpiration allows range operations while the wall clock is
  1721  	// within the expiration timestamp.
  1722  	LeaseExpiration
  1723  	// LeaseEpoch allows range operations while the node liveness epoch
  1724  	// is equal to the lease epoch.
  1725  	LeaseEpoch
  1726  )
  1727  
  1728  // Type returns the lease type.
  1729  func (l Lease) Type() LeaseType {
  1730  	if l.Epoch == 0 {
  1731  		return LeaseExpiration
  1732  	}
  1733  	return LeaseEpoch
  1734  }
  1735  
// Equivalent determines whether newL is considered the same lease as the
// receiver for the purposes of matching leases when executing a command.
// For expiration-based leases, extensions are allowed.
// Proposed timestamps are ignored for lease verification; for epoch-
// based leases, the start time of the lease is sufficient to
// avoid using an older lease with same epoch.
//
// Both the receiver and newL are passed by value, so the field resets
// performed below only touch local copies and never the caller's leases.
//
// NB: Lease.Equivalent is NOT symmetric. For expiration-based
// leases, a lease is equivalent to another with an equal or
// later expiration, but not an earlier expiration.
func (l Lease) Equivalent(newL Lease) bool {
	// Ignore proposed timestamp & deprecated start stasis.
	l.ProposedTS, newL.ProposedTS = nil, nil
	l.DeprecatedStartStasis, newL.DeprecatedStartStasis = nil, nil
	// Ignore sequence numbers, they are simply a reflection of
	// the equivalency of other fields.
	l.Sequence, newL.Sequence = 0, 0
	// Ignore the ReplicaDescriptor's type. This shouldn't affect lease
	// equivalency because Raft state shouldn't be factored into the state of a
	// Replica's lease. We don't expect a leaseholder to ever become a LEARNER
	// replica, but that also shouldn't prevent it from extending its lease. The
	// code also avoids a potential bug where an unset ReplicaType and a set
	// VOTER ReplicaType are considered distinct and non-equivalent.
	//
	// Change this line to the following when ReplicaType becomes non-nullable:
	//  l.Replica.Type, newL.Replica.Type = 0, 0
	l.Replica.Type, newL.Replica.Type = nil, nil
	// Dispatch on the receiver's lease type. Each branch clears the fields
	// that must not take part in the final comparison for that type.
	switch l.Type() {
	case LeaseEpoch:
		// Ignore expirations. This seems benign but since we changed the
		// nullability of this field in the 1.2 cycle, it's crucial and
		// tested in TestLeaseEquivalence.
		l.Expiration, newL.Expiration = nil, nil

		// Equal epochs are zeroed on both sides; differing epochs are left
		// in place so that the final comparison fails.
		if l.Epoch == newL.Epoch {
			l.Epoch, newL.Epoch = 0, 0
		}
	case LeaseExpiration:
		// See the comment above, though this field's nullability wasn't
		// changed. We zero it out for completeness only.
		l.Epoch, newL.Epoch = 0, 0

		// For expiration-based leases, extensions are considered equivalent.
		// This is the one case where Equivalent is not commutative and, as
		// such, requires special handling beneath Raft (see checkForcedErrLocked).
		if l.GetExpiration().LessEq(newL.GetExpiration()) {
			l.Expiration, newL.Expiration = nil, nil
		}
	}
	// Everything that was not cleared above must now match exactly. Note
	// that == compares any pointer field still set by address, not by the
	// value it points to.
	return l == newL
}
  1789  
  1790  // GetExpiration returns the lease expiration or the zero timestamp if the
  1791  // receiver is not an expiration-based lease.
  1792  func (l Lease) GetExpiration() hlc.Timestamp {
  1793  	if l.Expiration == nil {
  1794  		return hlc.Timestamp{}
  1795  	}
  1796  	return *l.Expiration
  1797  }
  1798  
  1799  // equivalentTimestamps compares two timestamps for equality and also considers
  1800  // the nil timestamp equal to the zero timestamp.
  1801  func equivalentTimestamps(a, b *hlc.Timestamp) bool {
  1802  	if a == nil {
  1803  		if b == nil {
  1804  			return true
  1805  		}
  1806  		if (*b == hlc.Timestamp{}) {
  1807  			return true
  1808  		}
  1809  	} else if b == nil {
  1810  		if (*a == hlc.Timestamp{}) {
  1811  			return true
  1812  		}
  1813  	}
  1814  	return a.Equal(b)
  1815  }
  1816  
  1817  // Equal implements the gogoproto Equal interface. This implementation is
  1818  // forked from the gogoproto generated code to allow l.Expiration == nil and
  1819  // l.Expiration == &hlc.Timestamp{} to compare equal. Ditto for
  1820  // DeprecatedStartStasis.
  1821  func (l *Lease) Equal(that interface{}) bool {
  1822  	if that == nil {
  1823  		return l == nil
  1824  	}
  1825  
  1826  	that1, ok := that.(*Lease)
  1827  	if !ok {
  1828  		that2, ok := that.(Lease)
  1829  		if ok {
  1830  			that1 = &that2
  1831  		} else {
  1832  			return false
  1833  		}
  1834  	}
  1835  	if that1 == nil {
  1836  		return l == nil
  1837  	} else if l == nil {
  1838  		return false
  1839  	}
  1840  
  1841  	if !l.Start.Equal(&that1.Start) {
  1842  		return false
  1843  	}
  1844  	if !equivalentTimestamps(l.Expiration, that1.Expiration) {
  1845  		return false
  1846  	}
  1847  	if !l.Replica.Equal(&that1.Replica) {
  1848  		return false
  1849  	}
  1850  	if !equivalentTimestamps(l.DeprecatedStartStasis, that1.DeprecatedStartStasis) {
  1851  		return false
  1852  	}
  1853  	if !l.ProposedTS.Equal(that1.ProposedTS) {
  1854  		return false
  1855  	}
  1856  	if l.Epoch != that1.Epoch {
  1857  		return false
  1858  	}
  1859  	if l.Sequence != that1.Sequence {
  1860  		return false
  1861  	}
  1862  	return true
  1863  }
  1864  
  1865  // MakeIntent makes an intent with the given txn and key.
  1866  // This is suitable for use when constructing WriteIntentError.
  1867  func MakeIntent(txn *enginepb.TxnMeta, key Key) Intent {
  1868  	var i Intent
  1869  	i.Key = key
  1870  	i.Txn = *txn
  1871  	return i
  1872  }
  1873  
  1874  // AsIntents takes a transaction and a slice of keys and
  1875  // returns it as a slice of intents.
  1876  func AsIntents(txn *enginepb.TxnMeta, keys []Key) []Intent {
  1877  	ret := make([]Intent, len(keys))
  1878  	for i := range keys {
  1879  		ret[i] = MakeIntent(txn, keys[i])
  1880  	}
  1881  	return ret
  1882  }
  1883  
  1884  // MakeLockAcquisition makes a lock acquisition message from the given
  1885  // txn, key, and durability level.
  1886  func MakeLockAcquisition(txn *Transaction, key Key, dur lock.Durability) LockAcquisition {
  1887  	return LockAcquisition{Span: Span{Key: key}, Txn: txn.TxnMeta, Durability: dur}
  1888  }
  1889  
  1890  // MakeLockUpdate makes a lock update from the given txn and span.
  1891  func MakeLockUpdate(txn *Transaction, span Span) LockUpdate {
  1892  	u := LockUpdate{Span: span}
  1893  	u.SetTxn(txn)
  1894  	return u
  1895  }
  1896  
  1897  // AsLockUpdates takes a slice of spans and returns it as a slice of
  1898  // lock updates.
  1899  func AsLockUpdates(txn *Transaction, spans []Span) []LockUpdate {
  1900  	ret := make([]LockUpdate, len(spans))
  1901  	for i := range spans {
  1902  		ret[i] = MakeLockUpdate(txn, spans[i])
  1903  	}
  1904  	return ret
  1905  }
  1906  
  1907  // SetTxn updates the transaction details in the lock update.
  1908  func (u *LockUpdate) SetTxn(txn *Transaction) {
  1909  	u.Txn = txn.TxnMeta
  1910  	u.Status = txn.Status
  1911  	u.IgnoredSeqNums = txn.IgnoredSeqNums
  1912  }
  1913  
  1914  // EqualValue compares for equality.
  1915  func (s Span) EqualValue(o Span) bool {
  1916  	return s.Key.Equal(o.Key) && s.EndKey.Equal(o.EndKey)
  1917  }
  1918  
  1919  // Overlaps returns true WLOG for span A and B iff:
  1920  // 1. Both spans contain one key (just the start key) and they are equal; or
  1921  // 2. The span with only one key is contained inside the other span; or
  1922  // 3. The end key of span A is strictly greater than the start key of span B
  1923  //    and the end key of span B is strictly greater than the start key of span
  1924  //    A.
  1925  func (s Span) Overlaps(o Span) bool {
  1926  	if !s.Valid() || !o.Valid() {
  1927  		return false
  1928  	}
  1929  
  1930  	if len(s.EndKey) == 0 && len(o.EndKey) == 0 {
  1931  		return s.Key.Equal(o.Key)
  1932  	} else if len(s.EndKey) == 0 {
  1933  		return bytes.Compare(s.Key, o.Key) >= 0 && bytes.Compare(s.Key, o.EndKey) < 0
  1934  	} else if len(o.EndKey) == 0 {
  1935  		return bytes.Compare(o.Key, s.Key) >= 0 && bytes.Compare(o.Key, s.EndKey) < 0
  1936  	}
  1937  	return bytes.Compare(s.EndKey, o.Key) > 0 && bytes.Compare(s.Key, o.EndKey) < 0
  1938  }
  1939  
  1940  // Combine creates a new span containing the full union of the key
  1941  // space covered by the two spans. This includes any key space not
  1942  // covered by either span, but between them if the spans are disjoint.
  1943  // Warning: using this method to combine local and non-local spans is
  1944  // not recommended and will result in potentially database-wide
  1945  // spans being returned. Use with caution.
  1946  func (s Span) Combine(o Span) Span {
  1947  	if !s.Valid() || !o.Valid() {
  1948  		return Span{}
  1949  	}
  1950  
  1951  	min := s.Key
  1952  	max := s.Key
  1953  	if len(s.EndKey) > 0 {
  1954  		max = s.EndKey
  1955  	}
  1956  	if o.Key.Compare(min) < 0 {
  1957  		min = o.Key
  1958  	} else if o.Key.Compare(max) > 0 {
  1959  		max = o.Key
  1960  	}
  1961  	if len(o.EndKey) > 0 && o.EndKey.Compare(max) > 0 {
  1962  		max = o.EndKey
  1963  	}
  1964  	if min.Equal(max) {
  1965  		return Span{Key: min}
  1966  	} else if s.Key.Equal(max) || o.Key.Equal(max) {
  1967  		return Span{Key: min, EndKey: max.Next()}
  1968  	}
  1969  	return Span{Key: min, EndKey: max}
  1970  }
  1971  
  1972  // Contains returns whether the receiver contains the given span.
  1973  func (s Span) Contains(o Span) bool {
  1974  	if !s.Valid() || !o.Valid() {
  1975  		return false
  1976  	}
  1977  
  1978  	if len(s.EndKey) == 0 && len(o.EndKey) == 0 {
  1979  		return s.Key.Equal(o.Key)
  1980  	} else if len(s.EndKey) == 0 {
  1981  		return false
  1982  	} else if len(o.EndKey) == 0 {
  1983  		return bytes.Compare(o.Key, s.Key) >= 0 && bytes.Compare(o.Key, s.EndKey) < 0
  1984  	}
  1985  	return bytes.Compare(s.Key, o.Key) <= 0 && bytes.Compare(s.EndKey, o.EndKey) >= 0
  1986  }
  1987  
  1988  // ContainsKey returns whether the span contains the given key.
  1989  func (s Span) ContainsKey(key Key) bool {
  1990  	return bytes.Compare(key, s.Key) >= 0 && bytes.Compare(key, s.EndKey) < 0
  1991  }
  1992  
  1993  // ProperlyContainsKey returns whether the span properly contains the given key.
  1994  func (s Span) ProperlyContainsKey(key Key) bool {
  1995  	return bytes.Compare(key, s.Key) > 0 && bytes.Compare(key, s.EndKey) < 0
  1996  }
  1997  
  1998  // AsRange returns the Span as an interval.Range.
  1999  func (s Span) AsRange() interval.Range {
  2000  	startKey := s.Key
  2001  	endKey := s.EndKey
  2002  	if len(endKey) == 0 {
  2003  		endKey = s.Key.Next()
  2004  		startKey = endKey[:len(startKey)]
  2005  	}
  2006  	return interval.Range{
  2007  		Start: interval.Comparable(startKey),
  2008  		End:   interval.Comparable(endKey),
  2009  	}
  2010  }
  2011  
  2012  func (s Span) String() string {
  2013  	const maxChars = math.MaxInt32
  2014  	return PrettyPrintRange(s.Key, s.EndKey, maxChars)
  2015  }
  2016  
  2017  // SplitOnKey returns two spans where the left span has EndKey and right span
  2018  // has start Key of the split key, respectively.
  2019  // If the split key lies outside the span, the original span is returned on the
  2020  // left (and right is an invalid span with empty keys).
  2021  func (s Span) SplitOnKey(key Key) (left Span, right Span) {
  2022  	// Cannot split on or before start key or on or after end key.
  2023  	if bytes.Compare(key, s.Key) <= 0 || bytes.Compare(key, s.EndKey) >= 0 {
  2024  		return s, Span{}
  2025  	}
  2026  
  2027  	return Span{Key: s.Key, EndKey: key}, Span{Key: key, EndKey: s.EndKey}
  2028  }
  2029  
  2030  // Valid returns whether or not the span is a "valid span".
  2031  // A valid span cannot have an empty start and end key and must satisfy either:
  2032  // 1. The end key is empty.
  2033  // 2. The start key is lexicographically-ordered before the end key.
  2034  func (s Span) Valid() bool {
  2035  	// s.Key can be empty if it is KeyMin.
  2036  	// Can't have both KeyMin start and end keys.
  2037  	if len(s.Key) == 0 && len(s.EndKey) == 0 {
  2038  		return false
  2039  	}
  2040  
  2041  	if len(s.EndKey) == 0 {
  2042  		return true
  2043  	}
  2044  
  2045  	if bytes.Compare(s.Key, s.EndKey) >= 0 {
  2046  		return false
  2047  	}
  2048  
  2049  	return true
  2050  }
  2051  
  2052  // Spans is a slice of spans.
  2053  type Spans []Span
  2054  
  2055  // implement Sort.Interface
  2056  func (a Spans) Len() int           { return len(a) }
  2057  func (a Spans) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
  2058  func (a Spans) Less(i, j int) bool { return a[i].Key.Compare(a[j].Key) < 0 }
  2059  
  2060  // ContainsKey returns whether any of the spans in the set of spans contains
  2061  // the given key.
  2062  func (a Spans) ContainsKey(key Key) bool {
  2063  	for _, span := range a {
  2064  		if span.ContainsKey(key) {
  2065  			return true
  2066  		}
  2067  	}
  2068  
  2069  	return false
  2070  }
  2071  
  2072  // RSpan is a key range with an inclusive start RKey and an exclusive end RKey.
  2073  type RSpan struct {
  2074  	Key, EndKey RKey
  2075  }
  2076  
  2077  // Equal compares for equality.
  2078  func (rs RSpan) Equal(o RSpan) bool {
  2079  	return rs.Key.Equal(o.Key) && rs.EndKey.Equal(o.EndKey)
  2080  }
  2081  
  2082  // ContainsKey returns whether this span contains the specified key.
  2083  func (rs RSpan) ContainsKey(key RKey) bool {
  2084  	return bytes.Compare(key, rs.Key) >= 0 && bytes.Compare(key, rs.EndKey) < 0
  2085  }
  2086  
  2087  // ContainsKeyInverted returns whether this span contains the specified key. The
  2088  // receiver span is considered inverted, meaning that instead of containing the
  2089  // range ["key","endKey"), it contains the range ("key","endKey"].
  2090  func (rs RSpan) ContainsKeyInverted(key RKey) bool {
  2091  	return bytes.Compare(key, rs.Key) > 0 && bytes.Compare(key, rs.EndKey) <= 0
  2092  }
  2093  
  2094  // ContainsKeyRange returns whether this span contains the specified
  2095  // key range from start (inclusive) to end (exclusive).
  2096  // If end is empty or start is equal to end, returns ContainsKey(start).
  2097  func (rs RSpan) ContainsKeyRange(start, end RKey) bool {
  2098  	if len(end) == 0 {
  2099  		return rs.ContainsKey(start)
  2100  	}
  2101  	if comp := bytes.Compare(end, start); comp < 0 {
  2102  		return false
  2103  	} else if comp == 0 {
  2104  		return rs.ContainsKey(start)
  2105  	}
  2106  	return bytes.Compare(start, rs.Key) >= 0 && bytes.Compare(rs.EndKey, end) >= 0
  2107  }
  2108  
  2109  func (rs RSpan) String() string {
  2110  	const maxChars = math.MaxInt32
  2111  	return PrettyPrintRange(Key(rs.Key), Key(rs.EndKey), maxChars)
  2112  }
  2113  
  2114  // Intersect returns the intersection of the current span and the
  2115  // descriptor's range. Returns an error if the span and the
  2116  // descriptor's range do not overlap.
  2117  func (rs RSpan) Intersect(desc *RangeDescriptor) (RSpan, error) {
  2118  	if !rs.Key.Less(desc.EndKey) || !desc.StartKey.Less(rs.EndKey) {
  2119  		return rs, errors.Errorf("span and descriptor's range do not overlap: %s vs %s", rs, desc)
  2120  	}
  2121  
  2122  	key := rs.Key
  2123  	if key.Less(desc.StartKey) {
  2124  		key = desc.StartKey
  2125  	}
  2126  	endKey := rs.EndKey
  2127  	if !desc.ContainsKeyRange(desc.StartKey, endKey) {
  2128  		endKey = desc.EndKey
  2129  	}
  2130  	return RSpan{key, endKey}, nil
  2131  }
  2132  
  2133  // AsRawSpanWithNoLocals returns the RSpan as a Span. This is to be used only
  2134  // in select situations in which an RSpan is known to not contain a wrapped
  2135  // locally-addressed Span.
  2136  func (rs RSpan) AsRawSpanWithNoLocals() Span {
  2137  	return Span{
  2138  		Key:    Key(rs.Key),
  2139  		EndKey: Key(rs.EndKey),
  2140  	}
  2141  }
  2142  
  2143  // KeyValueByKey implements sorting of a slice of KeyValues by key.
  2144  type KeyValueByKey []KeyValue
  2145  
  2146  // Len implements sort.Interface.
  2147  func (kv KeyValueByKey) Len() int {
  2148  	return len(kv)
  2149  }
  2150  
  2151  // Less implements sort.Interface.
  2152  func (kv KeyValueByKey) Less(i, j int) bool {
  2153  	return bytes.Compare(kv[i].Key, kv[j].Key) < 0
  2154  }
  2155  
  2156  // Swap implements sort.Interface.
  2157  func (kv KeyValueByKey) Swap(i, j int) {
  2158  	kv[i], kv[j] = kv[j], kv[i]
  2159  }
  2160  
  2161  var _ sort.Interface = KeyValueByKey{}
  2162  
  2163  // observedTimestampSlice maintains an immutable sorted list of observed
  2164  // timestamps.
  2165  type observedTimestampSlice []ObservedTimestamp
  2166  
  2167  func (s observedTimestampSlice) index(nodeID NodeID) int {
  2168  	return sort.Search(len(s),
  2169  		func(i int) bool {
  2170  			return s[i].NodeID >= nodeID
  2171  		},
  2172  	)
  2173  }
  2174  
  2175  // get the observed timestamp for the specified node, returning false if no
  2176  // timestamp exists.
  2177  func (s observedTimestampSlice) get(nodeID NodeID) (hlc.Timestamp, bool) {
  2178  	i := s.index(nodeID)
  2179  	if i < len(s) && s[i].NodeID == nodeID {
  2180  		return s[i].Timestamp, true
  2181  	}
  2182  	return hlc.Timestamp{}, false
  2183  }
  2184  
  2185  // update the timestamp for the specified node, or add a new entry in the
  2186  // correct (sorted) location. The receiver is not mutated.
  2187  func (s observedTimestampSlice) update(
  2188  	nodeID NodeID, timestamp hlc.Timestamp,
  2189  ) observedTimestampSlice {
  2190  	i := s.index(nodeID)
  2191  	if i < len(s) && s[i].NodeID == nodeID {
  2192  		if timestamp.Less(s[i].Timestamp) {
  2193  			// The input slice is immutable, so copy and update.
  2194  			cpy := make(observedTimestampSlice, len(s))
  2195  			copy(cpy, s)
  2196  			cpy[i].Timestamp = timestamp
  2197  			return cpy
  2198  		}
  2199  		return s
  2200  	}
  2201  	// The input slice is immutable, so copy and update. Don't append to
  2202  	// avoid an allocation. Doing so could invalidate a previous update
  2203  	// to this receiver.
  2204  	cpy := make(observedTimestampSlice, len(s)+1)
  2205  	copy(cpy[:i], s[:i])
  2206  	cpy[i] = ObservedTimestamp{NodeID: nodeID, Timestamp: timestamp}
  2207  	copy(cpy[i+1:], s[i:])
  2208  	return cpy
  2209  }
  2210  
  2211  // SequencedWriteBySeq implements sorting of a slice of SequencedWrites
  2212  // by sequence number.
  2213  type SequencedWriteBySeq []SequencedWrite
  2214  
  2215  // Len implements sort.Interface.
  2216  func (s SequencedWriteBySeq) Len() int { return len(s) }
  2217  
  2218  // Less implements sort.Interface.
  2219  func (s SequencedWriteBySeq) Less(i, j int) bool { return s[i].Sequence < s[j].Sequence }
  2220  
  2221  // Swap implements sort.Interface.
  2222  func (s SequencedWriteBySeq) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
  2223  
  2224  var _ sort.Interface = SequencedWriteBySeq{}
  2225  
  2226  // Find searches for the index of the SequencedWrite with the provided
  2227  // sequence number. Returns -1 if no corresponding write is found.
  2228  func (s SequencedWriteBySeq) Find(seq enginepb.TxnSeq) int {
  2229  	if util.RaceEnabled {
  2230  		if !sort.IsSorted(s) {
  2231  			panic("SequencedWriteBySeq must be sorted")
  2232  		}
  2233  	}
  2234  	if i := sort.Search(len(s), func(i int) bool {
  2235  		return s[i].Sequence >= seq
  2236  	}); i < len(s) && s[i].Sequence == seq {
  2237  		return i
  2238  	}
  2239  	return -1
  2240  }
  2241  
  2242  // Silence unused warning.
  2243  var _ = (SequencedWriteBySeq{}).Find
  2244  
  2245  func init() {
  2246  	// Inject the format dependency into the enginepb package.
  2247  	enginepb.FormatBytesAsKey = func(k []byte) string { return Key(k).String() }
  2248  	enginepb.FormatBytesAsValue = func(v []byte) string { return Value{RawBytes: v}.PrettyPrint() }
  2249  }