// Source: github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/roachpb/data.go

// Copyright 2014 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package roachpb

import (
	"bytes"
	"context"
	"encoding/binary"
	"encoding/hex"
	"fmt"
	"hash"
	"hash/crc32"
	"math"
	"math/rand"
	"sort"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/cockroachdb/apd"
	"github.com/cockroachdb/cockroach/pkg/geo/geopb"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency/lock"
	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
	"github.com/cockroachdb/cockroach/pkg/util"
	"github.com/cockroachdb/cockroach/pkg/util/bitarray"
	"github.com/cockroachdb/cockroach/pkg/util/duration"
	"github.com/cockroachdb/cockroach/pkg/util/encoding"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/interval"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
	"github.com/cockroachdb/cockroach/pkg/util/timetz"
	"github.com/cockroachdb/cockroach/pkg/util/uuid"
	"github.com/cockroachdb/errors"
	"go.etcd.io/etcd/raft/raftpb"
)

var (
	// RKeyMin is a minimum key value which sorts before all other keys.
	RKeyMin = RKey("")
	// KeyMin is a minimum key value which sorts before all other keys.
	KeyMin = Key(RKeyMin)
	// RKeyMax is a maximum key value which sorts after all other keys.
	RKeyMax = RKey{0xff, 0xff}
	// KeyMax is a maximum key value which sorts after all other keys.
	KeyMax = Key(RKeyMax)

	// PrettyPrintKey prints a key in human readable format. It is a
	// function variable that is implemented in the keys package and
	// injected here in order to avoid a circular package import. While
	// it is nil, Key.String and Key.Format fall back to a quoted dump
	// of the raw bytes.
	// valDirs correspond to the encoding direction of each encoded value
	// in the key (if known). If left unspecified, the default encoding
	// direction for each value type is used (see
	// encoding.go:prettyPrintFirstValue).
	PrettyPrintKey func(valDirs []encoding.Direction, key Key) string

	// PrettyPrintRange prints a key range in human readable format. Like
	// PrettyPrintKey, it is implemented in the keys package and injected
	// here in order to avoid a circular package import.
	PrettyPrintRange func(start, end Key, maxChars int) string
)

// RKey denotes a Key whose local addressing has been accounted for.
// A key can be transformed to an RKey by keys.Addr().
//
// RKey stands for "resolved key," as in a key whose address has been resolved.
type RKey Key

// AsRawKey returns the RKey as a Key. This is to be used only in select
// situations in which an RKey is known to not contain a wrapped locally-
// addressed Key. That is, it must only be used when the original Key was not a
// local key. Whenever the Key which created the RKey is still available, it
// should be used instead.
//
// The conversion does not copy: the result aliases the receiver's bytes.
func (rk RKey) AsRawKey() Key {
	return Key(rk)
}

// Less compares two RKeys. It reports whether rk sorts strictly before
// otherRK in byte-wise lexicographic order.
func (rk RKey) Less(otherRK RKey) bool {
	return bytes.Compare(rk, otherRK) < 0
}

// Equal checks for byte-wise equality. A nil and an empty slice compare
// equal, as with bytes.Equal.
func (rk RKey) Equal(other []byte) bool {
	return bytes.Equal(rk, other)
}

// Next returns the RKey that sorts immediately after the given one.
// The method may only take a shallow copy of the RKey, so both the
// receiver and the return value should be treated as immutable after.
100 func (rk RKey) Next() RKey { 101 return RKey(BytesNext(rk)) 102 } 103 104 // PrefixEnd determines the end key given key as a prefix, that is the 105 // key that sorts precisely behind all keys starting with prefix: "1" 106 // is added to the final byte and the carry propagated. The special 107 // cases of nil and KeyMin always returns KeyMax. 108 func (rk RKey) PrefixEnd() RKey { 109 if len(rk) == 0 { 110 return RKeyMax 111 } 112 return RKey(bytesPrefixEnd(rk)) 113 } 114 115 func (rk RKey) String() string { 116 return Key(rk).String() 117 } 118 119 // StringWithDirs - see Key.String.WithDirs. 120 func (rk RKey) StringWithDirs(valDirs []encoding.Direction, maxLen int) string { 121 return Key(rk).StringWithDirs(valDirs, maxLen) 122 } 123 124 // Key is a custom type for a byte string in proto 125 // messages which refer to Cockroach keys. 126 type Key []byte 127 128 // BytesNext returns the next possible byte slice, using the extra capacity 129 // of the provided slice if possible, and if not, appending an \x00. 130 func BytesNext(b []byte) []byte { 131 if cap(b) > len(b) { 132 bNext := b[:len(b)+1] 133 if bNext[len(bNext)-1] == 0 { 134 return bNext 135 } 136 } 137 // TODO(spencer): Do we need to enforce KeyMaxLength here? 138 // Switched to "make and copy" pattern in #4963 for performance. 139 bn := make([]byte, len(b)+1) 140 copy(bn, b) 141 bn[len(bn)-1] = 0 142 return bn 143 } 144 145 func bytesPrefixEnd(b []byte) []byte { 146 // Switched to "make and copy" pattern in #4963 for performance. 147 end := make([]byte, len(b)) 148 copy(end, b) 149 for i := len(end) - 1; i >= 0; i-- { 150 end[i] = end[i] + 1 151 if end[i] != 0 { 152 return end[:i+1] 153 } 154 } 155 // This statement will only be reached if the key is already a 156 // maximal byte string (i.e. already \xff...). 157 return b 158 } 159 160 // Next returns the next key in lexicographic sort order. 
The method may only 161 // take a shallow copy of the Key, so both the receiver and the return 162 // value should be treated as immutable after. 163 func (k Key) Next() Key { 164 return Key(BytesNext(k)) 165 } 166 167 // IsPrev is a more efficient version of k.Next().Equal(m). 168 func (k Key) IsPrev(m Key) bool { 169 l := len(m) - 1 170 return l == len(k) && m[l] == 0 && k.Equal(m[:l]) 171 } 172 173 // PrefixEnd determines the end key given key as a prefix, that is the 174 // key that sorts precisely behind all keys starting with prefix: "1" 175 // is added to the final byte and the carry propagated. The special 176 // cases of nil and KeyMin always returns KeyMax. 177 func (k Key) PrefixEnd() Key { 178 if len(k) == 0 { 179 return Key(RKeyMax) 180 } 181 return Key(bytesPrefixEnd(k)) 182 } 183 184 // Equal returns whether two keys are identical. 185 func (k Key) Equal(l Key) bool { 186 return bytes.Equal(k, l) 187 } 188 189 // Compare compares the two Keys. 190 func (k Key) Compare(b Key) int { 191 return bytes.Compare(k, b) 192 } 193 194 // String returns a string-formatted version of the key. 195 func (k Key) String() string { 196 return k.StringWithDirs(nil /* valDirs */, 0 /* maxLen */) 197 } 198 199 // StringWithDirs is the value encoding direction-aware version of String. 200 // 201 // Args: 202 // valDirs: The direction for the key's components, generally needed for correct 203 // decoding. If nil, the values are pretty-printed with default encoding 204 // direction. 205 // maxLen: If not 0, only the first maxLen chars from the decoded key are 206 // returned, plus a "..." suffix. 207 func (k Key) StringWithDirs(valDirs []encoding.Direction, maxLen int) string { 208 var s string 209 if PrettyPrintKey != nil { 210 s = PrettyPrintKey(valDirs, k) 211 } else { 212 s = fmt.Sprintf("%q", []byte(k)) 213 } 214 if maxLen != 0 && len(s) > maxLen { 215 return s[0:maxLen] + "..." 216 } 217 return s 218 } 219 220 // Format implements the fmt.Formatter interface. 
221 func (k Key) Format(f fmt.State, verb rune) { 222 // Note: this implementation doesn't handle the width and precision 223 // specifiers such as "%20.10s". 224 if verb == 'x' { 225 fmt.Fprintf(f, "%x", []byte(k)) 226 } else if PrettyPrintKey != nil { 227 fmt.Fprint(f, PrettyPrintKey(nil /* valDirs */, k)) 228 } else { 229 fmt.Fprint(f, strconv.Quote(string(k))) 230 } 231 } 232 233 const ( 234 checksumUninitialized = 0 235 checksumSize = 4 236 tagPos = checksumSize 237 headerSize = tagPos + 1 238 ) 239 240 func (v Value) checksum() uint32 { 241 if len(v.RawBytes) < checksumSize { 242 return 0 243 } 244 _, u, err := encoding.DecodeUint32Ascending(v.RawBytes[:checksumSize]) 245 if err != nil { 246 panic(err) 247 } 248 return u 249 } 250 251 func (v *Value) setChecksum(cksum uint32) { 252 if len(v.RawBytes) >= checksumSize { 253 encoding.EncodeUint32Ascending(v.RawBytes[:0], cksum) 254 } 255 } 256 257 // InitChecksum initializes a checksum based on the provided key and 258 // the contents of the value. If the value contains a byte slice, the 259 // checksum includes it directly. 260 // 261 // TODO(peter): This method should return an error if the Value is corrupted 262 // (e.g. the RawBytes field is > 0 but smaller than the header size). 263 func (v *Value) InitChecksum(key []byte) { 264 if v.RawBytes == nil { 265 return 266 } 267 // Should be uninitialized. 268 if v.checksum() != checksumUninitialized { 269 panic(fmt.Sprintf("initialized checksum = %x", v.checksum())) 270 } 271 v.setChecksum(v.computeChecksum(key)) 272 } 273 274 // ClearChecksum clears the checksum value. 275 func (v *Value) ClearChecksum() { 276 v.setChecksum(0) 277 } 278 279 // Verify verifies the value's Checksum matches a newly-computed 280 // checksum of the value's contents. If the value's Checksum is not 281 // set the verification is a noop. 
282 func (v Value) Verify(key []byte) error { 283 if n := len(v.RawBytes); n > 0 && n < headerSize { 284 return fmt.Errorf("%s: invalid header size: %d", Key(key), n) 285 } 286 if sum := v.checksum(); sum != 0 { 287 if computedSum := v.computeChecksum(key); computedSum != sum { 288 return fmt.Errorf("%s: invalid checksum (%x) value [% x]", 289 Key(key), computedSum, v.RawBytes) 290 } 291 } 292 return nil 293 } 294 295 // ShallowClone returns a shallow clone of the receiver. 296 func (v *Value) ShallowClone() *Value { 297 if v == nil { 298 return nil 299 } 300 t := *v 301 return &t 302 } 303 304 // IsPresent returns true if the value is present (existent and not a tombstone). 305 func (v *Value) IsPresent() bool { 306 return v != nil && len(v.RawBytes) != 0 307 } 308 309 // MakeValueFromString returns a value with bytes and tag set. 310 func MakeValueFromString(s string) Value { 311 v := Value{} 312 v.SetString(s) 313 return v 314 } 315 316 // MakeValueFromBytes returns a value with bytes and tag set. 317 func MakeValueFromBytes(bs []byte) Value { 318 v := Value{} 319 v.SetBytes(bs) 320 return v 321 } 322 323 // MakeValueFromBytesAndTimestamp returns a value with bytes, timestamp and 324 // tag set. 325 func MakeValueFromBytesAndTimestamp(bs []byte, t hlc.Timestamp) Value { 326 v := Value{Timestamp: t} 327 v.SetBytes(bs) 328 return v 329 } 330 331 // GetTag retrieves the value type. 
332 func (v Value) GetTag() ValueType { 333 if len(v.RawBytes) <= tagPos { 334 return ValueType_UNKNOWN 335 } 336 return ValueType(v.RawBytes[tagPos]) 337 } 338 339 func (v *Value) setTag(t ValueType) { 340 v.RawBytes[tagPos] = byte(t) 341 } 342 343 func (v Value) dataBytes() []byte { 344 return v.RawBytes[headerSize:] 345 } 346 347 func (v *Value) ensureRawBytes(size int) { 348 if cap(v.RawBytes) < size { 349 v.RawBytes = make([]byte, size) 350 return 351 } 352 v.RawBytes = v.RawBytes[:size] 353 v.setChecksum(checksumUninitialized) 354 } 355 356 // EqualData returns a boolean reporting whether the receiver and the parameter 357 // have equivalent byte values. This check ignores the optional checksum field 358 // in the Values' byte slices, returning only whether the Values have the same 359 // tag and encoded data. 360 // 361 // This method should be used whenever the raw bytes of two Values are being 362 // compared instead of comparing the RawBytes slices directly because it ignores 363 // the checksum header, which is optional. 364 func (v Value) EqualData(o Value) bool { 365 return bytes.Equal(v.RawBytes[checksumSize:], o.RawBytes[checksumSize:]) 366 } 367 368 // SetBytes sets the bytes and tag field of the receiver and clears the checksum. 369 func (v *Value) SetBytes(b []byte) { 370 v.ensureRawBytes(headerSize + len(b)) 371 copy(v.dataBytes(), b) 372 v.setTag(ValueType_BYTES) 373 } 374 375 // SetString sets the bytes and tag field of the receiver and clears the 376 // checksum. This is identical to SetBytes, but specialized for a string 377 // argument. 378 func (v *Value) SetString(s string) { 379 v.ensureRawBytes(headerSize + len(s)) 380 copy(v.dataBytes(), s) 381 v.setTag(ValueType_BYTES) 382 } 383 384 // SetFloat encodes the specified float64 value into the bytes field of the 385 // receiver, sets the tag and clears the checksum. 
386 func (v *Value) SetFloat(f float64) { 387 v.ensureRawBytes(headerSize + 8) 388 encoding.EncodeUint64Ascending(v.RawBytes[headerSize:headerSize], math.Float64bits(f)) 389 v.setTag(ValueType_FLOAT) 390 } 391 392 // SetGeo encodes the specified geo value into the bytes field of the 393 // receiver, sets the tag and clears the checksum. 394 func (v *Value) SetGeo(so geopb.SpatialObject) error { 395 bytes, err := protoutil.Marshal(&so) 396 if err != nil { 397 return err 398 } 399 v.ensureRawBytes(headerSize + len(bytes)) 400 copy(v.dataBytes(), bytes) 401 v.setTag(ValueType_GEO) 402 return nil 403 } 404 405 // SetBool encodes the specified bool value into the bytes field of the 406 // receiver, sets the tag and clears the checksum. 407 func (v *Value) SetBool(b bool) { 408 // 0 or 1 will always encode to a 1-byte long varint. 409 v.ensureRawBytes(headerSize + 1) 410 i := int64(0) 411 if b { 412 i = 1 413 } 414 _ = binary.PutVarint(v.RawBytes[headerSize:], i) 415 v.setTag(ValueType_INT) 416 } 417 418 // SetInt encodes the specified int64 value into the bytes field of the 419 // receiver, sets the tag and clears the checksum. 420 func (v *Value) SetInt(i int64) { 421 v.ensureRawBytes(headerSize + binary.MaxVarintLen64) 422 n := binary.PutVarint(v.RawBytes[headerSize:], i) 423 v.RawBytes = v.RawBytes[:headerSize+n] 424 v.setTag(ValueType_INT) 425 } 426 427 // SetProto encodes the specified proto message into the bytes field of the 428 // receiver and clears the checksum. If the proto message is an 429 // InternalTimeSeriesData, the tag will be set to TIMESERIES rather than BYTES. 430 func (v *Value) SetProto(msg protoutil.Message) error { 431 // All of the Cockroach protos implement MarshalTo and Size. So we marshal 432 // directly into the Value.RawBytes field instead of allocating a separate 433 // []byte and copying. 
434 v.ensureRawBytes(headerSize + msg.Size()) 435 if _, err := protoutil.MarshalTo(msg, v.RawBytes[headerSize:]); err != nil { 436 return err 437 } 438 // Special handling for timeseries data. 439 if _, ok := msg.(*InternalTimeSeriesData); ok { 440 v.setTag(ValueType_TIMESERIES) 441 } else { 442 v.setTag(ValueType_BYTES) 443 } 444 return nil 445 } 446 447 // SetTime encodes the specified time value into the bytes field of the 448 // receiver, sets the tag and clears the checksum. 449 func (v *Value) SetTime(t time.Time) { 450 const encodingSizeOverestimate = 11 451 v.ensureRawBytes(headerSize + encodingSizeOverestimate) 452 v.RawBytes = encoding.EncodeTimeAscending(v.RawBytes[:headerSize], t) 453 v.setTag(ValueType_TIME) 454 } 455 456 // SetTimeTZ encodes the specified time value into the bytes field of the 457 // receiver, sets the tag and clears the checksum. 458 func (v *Value) SetTimeTZ(t timetz.TimeTZ) { 459 v.ensureRawBytes(headerSize + encoding.EncodedTimeTZMaxLen) 460 v.RawBytes = encoding.EncodeTimeTZAscending(v.RawBytes[:headerSize], t) 461 v.setTag(ValueType_TIMETZ) 462 } 463 464 // SetDuration encodes the specified duration value into the bytes field of the 465 // receiver, sets the tag and clears the checksum. 466 func (v *Value) SetDuration(t duration.Duration) error { 467 var err error 468 v.ensureRawBytes(headerSize + encoding.EncodedDurationMaxLen) 469 v.RawBytes, err = encoding.EncodeDurationAscending(v.RawBytes[:headerSize], t) 470 if err != nil { 471 return err 472 } 473 v.setTag(ValueType_DURATION) 474 return nil 475 } 476 477 // SetBitArray encodes the specified bit array value into the bytes field of the 478 // receiver, sets the tag and clears the checksum. 
479 func (v *Value) SetBitArray(t bitarray.BitArray) { 480 words, _ := t.EncodingParts() 481 v.ensureRawBytes(headerSize + encoding.NonsortingUvarintMaxLen + 8*len(words)) 482 v.RawBytes = encoding.EncodeUntaggedBitArrayValue(v.RawBytes[:headerSize], t) 483 v.setTag(ValueType_BITARRAY) 484 } 485 486 // SetDecimal encodes the specified decimal value into the bytes field of 487 // the receiver using Gob encoding, sets the tag and clears the checksum. 488 func (v *Value) SetDecimal(dec *apd.Decimal) error { 489 decSize := encoding.UpperBoundNonsortingDecimalSize(dec) 490 v.ensureRawBytes(headerSize + decSize) 491 v.RawBytes = encoding.EncodeNonsortingDecimal(v.RawBytes[:headerSize], dec) 492 v.setTag(ValueType_DECIMAL) 493 return nil 494 } 495 496 // SetTuple sets the tuple bytes and tag field of the receiver and clears the 497 // checksum. 498 func (v *Value) SetTuple(data []byte) { 499 v.ensureRawBytes(headerSize + len(data)) 500 copy(v.dataBytes(), data) 501 v.setTag(ValueType_TUPLE) 502 } 503 504 // GetBytes returns the bytes field of the receiver. If the tag is not 505 // BYTES an error will be returned. 506 func (v Value) GetBytes() ([]byte, error) { 507 if tag := v.GetTag(); tag != ValueType_BYTES { 508 return nil, fmt.Errorf("value type is not %s: %s", ValueType_BYTES, tag) 509 } 510 return v.dataBytes(), nil 511 } 512 513 // GetFloat decodes a float64 value from the bytes field of the receiver. If 514 // the bytes field is not 8 bytes in length or the tag is not FLOAT an error 515 // will be returned. 
516 func (v Value) GetFloat() (float64, error) { 517 if tag := v.GetTag(); tag != ValueType_FLOAT { 518 return 0, fmt.Errorf("value type is not %s: %s", ValueType_FLOAT, tag) 519 } 520 dataBytes := v.dataBytes() 521 if len(dataBytes) != 8 { 522 return 0, fmt.Errorf("float64 value should be exactly 8 bytes: %d", len(dataBytes)) 523 } 524 _, u, err := encoding.DecodeUint64Ascending(dataBytes) 525 if err != nil { 526 return 0, err 527 } 528 return math.Float64frombits(u), nil 529 } 530 531 // GetGeo decodes a geo value from the bytes field of the receiver. If the 532 // tag is not GEO an error will be returned. 533 func (v Value) GetGeo() (geopb.SpatialObject, error) { 534 if tag := v.GetTag(); tag != ValueType_GEO { 535 return geopb.SpatialObject{}, fmt.Errorf("value type is not %s: %s", ValueType_GEO, tag) 536 } 537 var ret geopb.SpatialObject 538 err := protoutil.Unmarshal(v.dataBytes(), &ret) 539 return ret, err 540 } 541 542 // GetBool decodes a bool value from the bytes field of the receiver. If the 543 // tag is not INT (the tag used for bool values) or the value cannot be decoded 544 // an error will be returned. 545 func (v Value) GetBool() (bool, error) { 546 if tag := v.GetTag(); tag != ValueType_INT { 547 return false, fmt.Errorf("value type is not %s: %s", ValueType_INT, tag) 548 } 549 i, n := binary.Varint(v.dataBytes()) 550 if n <= 0 { 551 return false, fmt.Errorf("int64 varint decoding failed: %d", n) 552 } 553 if i > 1 || i < 0 { 554 return false, fmt.Errorf("invalid bool: %d", i) 555 } 556 return i != 0, nil 557 } 558 559 // GetInt decodes an int64 value from the bytes field of the receiver. If the 560 // tag is not INT or the value cannot be decoded an error will be returned. 
561 func (v Value) GetInt() (int64, error) { 562 if tag := v.GetTag(); tag != ValueType_INT { 563 return 0, fmt.Errorf("value type is not %s: %s", ValueType_INT, tag) 564 } 565 i, n := binary.Varint(v.dataBytes()) 566 if n <= 0 { 567 return 0, fmt.Errorf("int64 varint decoding failed: %d", n) 568 } 569 return i, nil 570 } 571 572 // GetProto unmarshals the bytes field of the receiver into msg. If 573 // unmarshalling fails or the tag is not BYTES, an error will be 574 // returned. 575 func (v Value) GetProto(msg protoutil.Message) error { 576 expectedTag := ValueType_BYTES 577 578 // Special handling for ts data. 579 if _, ok := msg.(*InternalTimeSeriesData); ok { 580 expectedTag = ValueType_TIMESERIES 581 } 582 583 if tag := v.GetTag(); tag != expectedTag { 584 return fmt.Errorf("value type is not %s: %s", expectedTag, tag) 585 } 586 return protoutil.Unmarshal(v.dataBytes(), msg) 587 } 588 589 // GetTime decodes a time value from the bytes field of the receiver. If the 590 // tag is not TIME an error will be returned. 591 func (v Value) GetTime() (time.Time, error) { 592 if tag := v.GetTag(); tag != ValueType_TIME { 593 return time.Time{}, fmt.Errorf("value type is not %s: %s", ValueType_TIME, tag) 594 } 595 _, t, err := encoding.DecodeTimeAscending(v.dataBytes()) 596 return t, err 597 } 598 599 // GetTimeTZ decodes a time value from the bytes field of the receiver. If the 600 // tag is not TIMETZ an error will be returned. 601 func (v Value) GetTimeTZ() (timetz.TimeTZ, error) { 602 if tag := v.GetTag(); tag != ValueType_TIMETZ { 603 return timetz.TimeTZ{}, fmt.Errorf("value type is not %s: %s", ValueType_TIMETZ, tag) 604 } 605 _, t, err := encoding.DecodeTimeTZAscending(v.dataBytes()) 606 return t, err 607 } 608 609 // GetDuration decodes a duration value from the bytes field of the receiver. If 610 // the tag is not DURATION an error will be returned. 
611 func (v Value) GetDuration() (duration.Duration, error) { 612 if tag := v.GetTag(); tag != ValueType_DURATION { 613 return duration.Duration{}, fmt.Errorf("value type is not %s: %s", ValueType_DURATION, tag) 614 } 615 _, t, err := encoding.DecodeDurationAscending(v.dataBytes()) 616 return t, err 617 } 618 619 // GetBitArray decodes a bit array value from the bytes field of the receiver. If 620 // the tag is not BITARRAY an error will be returned. 621 func (v Value) GetBitArray() (bitarray.BitArray, error) { 622 if tag := v.GetTag(); tag != ValueType_BITARRAY { 623 return bitarray.BitArray{}, fmt.Errorf("value type is not %s: %s", ValueType_BITARRAY, tag) 624 } 625 _, t, err := encoding.DecodeUntaggedBitArrayValue(v.dataBytes()) 626 return t, err 627 } 628 629 // GetDecimal decodes a decimal value from the bytes of the receiver. If the 630 // tag is not DECIMAL an error will be returned. 631 func (v Value) GetDecimal() (apd.Decimal, error) { 632 if tag := v.GetTag(); tag != ValueType_DECIMAL { 633 return apd.Decimal{}, fmt.Errorf("value type is not %s: %s", ValueType_DECIMAL, tag) 634 } 635 return encoding.DecodeNonsortingDecimal(v.dataBytes(), nil) 636 } 637 638 // GetDecimalInto decodes a decimal value from the bytes of the receiver, 639 // writing it directly into the provided non-null apd.Decimal. If the 640 // tag is not DECIMAL an error will be returned. 641 func (v Value) GetDecimalInto(d *apd.Decimal) error { 642 if tag := v.GetTag(); tag != ValueType_DECIMAL { 643 return fmt.Errorf("value type is not %s: %s", ValueType_DECIMAL, tag) 644 } 645 return encoding.DecodeIntoNonsortingDecimal(d, v.dataBytes(), nil) 646 } 647 648 // GetTimeseries decodes an InternalTimeSeriesData value from the bytes 649 // field of the receiver. An error will be returned if the tag is not 650 // TIMESERIES or if decoding fails. 651 func (v Value) GetTimeseries() (InternalTimeSeriesData, error) { 652 ts := InternalTimeSeriesData{} 653 // GetProto mutates its argument. 
`return ts, v.GetProto(&ts)` 654 // happens to work in gc, but does not work in gccgo. 655 // 656 // See https://github.com/golang/go/issues/23188. 657 err := v.GetProto(&ts) 658 return ts, err 659 } 660 661 // GetTuple returns the tuple bytes of the receiver. If the tag is not TUPLE an 662 // error will be returned. 663 func (v Value) GetTuple() ([]byte, error) { 664 if tag := v.GetTag(); tag != ValueType_TUPLE { 665 return nil, fmt.Errorf("value type is not %s: %s", ValueType_TUPLE, tag) 666 } 667 return v.dataBytes(), nil 668 } 669 670 var crc32Pool = sync.Pool{ 671 New: func() interface{} { 672 return crc32.NewIEEE() 673 }, 674 } 675 676 func computeChecksum(key, rawBytes []byte, crc hash.Hash32) uint32 { 677 if len(rawBytes) < headerSize { 678 return 0 679 } 680 if _, err := crc.Write(key); err != nil { 681 panic(err) 682 } 683 if _, err := crc.Write(rawBytes[checksumSize:]); err != nil { 684 panic(err) 685 } 686 sum := crc.Sum32() 687 crc.Reset() 688 // We reserved the value 0 (checksumUninitialized) to indicate that a checksum 689 // has not been initialized. This reservation is accomplished by folding a 690 // computed checksum of 0 to the value 1. 691 if sum == checksumUninitialized { 692 return 1 693 } 694 return sum 695 } 696 697 // computeChecksum computes a checksum based on the provided key and 698 // the contents of the value. 699 func (v Value) computeChecksum(key []byte) uint32 { 700 crc := crc32Pool.Get().(hash.Hash32) 701 sum := computeChecksum(key, v.RawBytes, crc) 702 crc32Pool.Put(crc) 703 return sum 704 } 705 706 // PrettyPrint returns the value in a human readable format. 707 // e.g. `Put /Table/51/1/1/0 -> /TUPLE/2:2:Int/7/1:3:Float/6.28` 708 // In `1:3:Float/6.28`, the `1` is the column id diff as stored, `3` is the 709 // computed (i.e. not stored) actual column id, `Float` is the type, and `6.28` 710 // is the encoded value. 
711 func (v Value) PrettyPrint() string { 712 if len(v.RawBytes) == 0 { 713 return "/<empty>" 714 } 715 var buf bytes.Buffer 716 t := v.GetTag() 717 buf.WriteRune('/') 718 buf.WriteString(t.String()) 719 buf.WriteRune('/') 720 721 var err error 722 switch t { 723 case ValueType_TUPLE: 724 b := v.dataBytes() 725 var colID uint32 726 for i := 0; len(b) > 0; i++ { 727 if i != 0 { 728 buf.WriteRune('/') 729 } 730 _, _, colIDDiff, typ, err := encoding.DecodeValueTag(b) 731 if err != nil { 732 break 733 } 734 colID += colIDDiff 735 var s string 736 b, s, err = encoding.PrettyPrintValueEncoded(b) 737 if err != nil { 738 break 739 } 740 fmt.Fprintf(&buf, "%d:%d:%s/%s", colIDDiff, colID, typ, s) 741 } 742 case ValueType_INT: 743 var i int64 744 i, err = v.GetInt() 745 buf.WriteString(strconv.FormatInt(i, 10)) 746 case ValueType_FLOAT: 747 var f float64 748 f, err = v.GetFloat() 749 buf.WriteString(strconv.FormatFloat(f, 'g', -1, 64)) 750 case ValueType_BYTES: 751 var data []byte 752 data, err = v.GetBytes() 753 if encoding.PrintableBytes(data) { 754 buf.WriteString(string(data)) 755 } else { 756 buf.WriteString("0x") 757 buf.WriteString(hex.EncodeToString(data)) 758 } 759 case ValueType_BITARRAY: 760 var data bitarray.BitArray 761 data, err = v.GetBitArray() 762 buf.WriteByte('B') 763 data.Format(&buf) 764 case ValueType_TIME: 765 var t time.Time 766 t, err = v.GetTime() 767 buf.WriteString(t.UTC().Format(time.RFC3339Nano)) 768 case ValueType_DECIMAL: 769 var d apd.Decimal 770 d, err = v.GetDecimal() 771 buf.WriteString(d.String()) 772 case ValueType_DURATION: 773 var d duration.Duration 774 d, err = v.GetDuration() 775 buf.WriteString(d.StringNanos()) 776 default: 777 err = errors.Errorf("unknown tag: %s", t) 778 } 779 if err != nil { 780 // Ignore the contents of buf and return directly. 781 return fmt.Sprintf("/<err: %s>", err) 782 } 783 return buf.String() 784 } 785 786 // IsFinalized determines whether the transaction status is in a finalized 787 // state. 
A finalized state is terminal, meaning that once a transaction 788 // enters one of these states, it will never leave it. 789 func (ts TransactionStatus) IsFinalized() bool { 790 return ts == COMMITTED || ts == ABORTED 791 } 792 793 // IsCommittedOrStaging determines if the transaction is morally committed (i.e. 794 // in the COMMITTED or STAGING state). 795 func (ts TransactionStatus) IsCommittedOrStaging() bool { 796 return ts == COMMITTED || ts == STAGING 797 } 798 799 var _ errors.SafeMessager = Transaction{} 800 801 // MakeTransaction creates a new transaction. The transaction key is 802 // composed using the specified baseKey (for locality with data 803 // affected by the transaction) and a random ID to guarantee 804 // uniqueness. The specified user-level priority is combined with a 805 // randomly chosen value to yield a final priority, used to settle 806 // write conflicts in a way that avoids starvation of long-running 807 // transactions (see Replica.PushTxn). 808 // 809 // baseKey can be nil, in which case it will be set when sending the first 810 // write. 811 func MakeTransaction( 812 name string, baseKey Key, userPriority UserPriority, now hlc.Timestamp, maxOffsetNs int64, 813 ) Transaction { 814 u := uuid.FastMakeV4() 815 maxTS := now.Add(maxOffsetNs, 0) 816 817 return Transaction{ 818 TxnMeta: enginepb.TxnMeta{ 819 Key: baseKey, 820 ID: u, 821 WriteTimestamp: now, 822 MinTimestamp: now, 823 Priority: MakePriority(userPriority), 824 Sequence: 0, // 1-indexed, incremented before each Request 825 }, 826 Name: name, 827 LastHeartbeat: now, 828 ReadTimestamp: now, 829 MaxTimestamp: maxTS, 830 DeprecatedOrigTimestamp: now, // For compatibility with 19.2. 831 } 832 } 833 834 // LastActive returns the last timestamp at which client activity definitely 835 // occurred, i.e. the maximum of ReadTimestamp and LastHeartbeat. 
836 func (t Transaction) LastActive() hlc.Timestamp { 837 ts := t.LastHeartbeat 838 ts.Forward(t.ReadTimestamp) 839 840 // For compatibility with 19.2, handle the case where ReadTimestamp isn't 841 // set. 842 ts.Forward(t.DeprecatedOrigTimestamp) 843 return ts 844 } 845 846 // Clone creates a copy of the given transaction. The copy is shallow because 847 // none of the references held by a transaction allow interior mutability. 848 func (t Transaction) Clone() *Transaction { 849 return &t 850 } 851 852 // AssertInitialized crashes if the transaction is not initialized. 853 func (t *Transaction) AssertInitialized(ctx context.Context) { 854 if t.ID == (uuid.UUID{}) || t.WriteTimestamp == (hlc.Timestamp{}) { 855 log.Fatalf(ctx, "uninitialized txn: %s", *t) 856 } 857 } 858 859 // MakePriority generates a random priority value, biased by the specified 860 // userPriority. If userPriority=100, the random priority will be 100x more 861 // likely to be greater than if userPriority=1. If userPriority = 0.1, the 862 // random priority will be 1/10th as likely to be greater than if 863 // userPriority=NormalUserPriority ( = 1). Balance is achieved when 864 // userPriority=NormalUserPriority, in which case the priority chosen is 865 // unbiased. 866 // 867 // If userPriority is less than or equal to MinUserPriority, returns 868 // MinTxnPriority; if greater than or equal to MaxUserPriority, returns 869 // MaxTxnPriority. If userPriority is 0, returns NormalUserPriority. 870 func MakePriority(userPriority UserPriority) enginepb.TxnPriority { 871 // A currently undocumented feature allows an explicit priority to 872 // be set by specifying priority < 1. The explicit priority is 873 // simply -userPriority in this case. This is hacky, but currently 874 // used for unittesting. Perhaps this should be documented and allowed. 
875 if userPriority < 0 { 876 if -userPriority > UserPriority(math.MaxInt32) { 877 panic(fmt.Sprintf("cannot set explicit priority to a value less than -%d", math.MaxInt32)) 878 } 879 return enginepb.TxnPriority(-userPriority) 880 } else if userPriority == 0 { 881 userPriority = NormalUserPriority 882 } else if userPriority >= MaxUserPriority { 883 return enginepb.MaxTxnPriority 884 } else if userPriority <= MinUserPriority { 885 return enginepb.MinTxnPriority 886 } 887 888 // We generate random values which are biased according to priorities. If v1 is a value 889 // generated for priority p1 and v2 is a value of priority v2, we want the ratio of wins vs 890 // losses to be the same with the ratio of priorities: 891 // 892 // P[ v1 > v2 ] p1 p1 893 // ------------ = -- or, equivalently: P[ v1 > v2 ] = ------- 894 // P[ v2 < v1 ] p2 p1 + p2 895 // 896 // 897 // For example, priority 10 wins 10 out of 11 times over priority 1, and it wins 100 out of 101 898 // times over priority 0.1. 899 // 900 // 901 // We use the exponential distribution. This distribution has the probability density function 902 // PDF_lambda(x) = lambda * exp(-lambda * x) 903 // and the cumulative distribution function (i.e. probability that a random value is smaller 904 // than x): 905 // CDF_lambda(x) = Integral_0^x PDF_lambda(x) dx 906 // = 1 - exp(-lambda * x) 907 // 908 // Let's assume we generate x from the exponential distribution with the lambda rate set to 909 // l1 and we generate y from the distribution with the rate set to l2. 
The probability that x 910 // wins is: 911 // P[ x > y ] = Integral_0^inf Integral_0^x PDF_l1(x) PDF_l2(y) dy dx 912 // = Integral_0^inf PDF_l1(x) Integral_0^x PDF_l2(y) dy dx 913 // = Integral_0^inf PDF_l1(x) CDF_l2(x) dx 914 // = Integral_0^inf PDF_l1(x) (1 - exp(-l2 * x)) dx 915 // = 1 - Integral_0^inf l1 * exp(-(l1+l2) * x) dx 916 // = 1 - l1 / (l1 + l2) * Integral_0^inf PDF_(l1+l2)(x) dx 917 // = 1 - l1 / (l1 + l2) 918 // = l2 / (l1 + l2) 919 // 920 // We want this probability to be p1 / (p1 + p2) which we can get by setting 921 // l1 = 1 / p1 922 // l2 = 1 / p2 923 // It's easy to verify that (1/p2) / (1/p1 + 1/p2) = p1 / (p2 + p1). 924 // 925 // We can generate an exponentially distributed value using (rand.ExpFloat64() / lambda). 926 // In our case this works out to simply rand.ExpFloat64() * userPriority. 927 val := rand.ExpFloat64() * float64(userPriority) 928 929 // To convert to an integer, we scale things to accommodate a few (5) standard deviations for 930 // the maximum priority. The choice of the value is a trade-off between loss of resolution for 931 // low priorities and overflow (capping the value to MaxInt32) for high priorities. 932 // 933 // For userPriority=MaxUserPriority, the probability of overflow is 0.7%. 934 // For userPriority=(MaxUserPriority/2), the probability of overflow is 0.005%. 935 val = (val / (5 * float64(MaxUserPriority))) * math.MaxInt32 936 if val < float64(enginepb.MinTxnPriority+1) { 937 return enginepb.MinTxnPriority + 1 938 } else if val > float64(enginepb.MaxTxnPriority-1) { 939 return enginepb.MaxTxnPriority - 1 940 } 941 return enginepb.TxnPriority(val) 942 } 943 944 // Restart reconfigures a transaction for restart. The epoch is 945 // incremented for an in-place restart. The timestamp of the 946 // transaction on restart is set to the maximum of the transaction's 947 // timestamp and the specified timestamp. 
func (t *Transaction) Restart(
	userPriority UserPriority, upgradePriority enginepb.TxnPriority, timestamp hlc.Timestamp,
) {
	t.BumpEpoch()
	// Only ever move the write timestamp forward.
	if t.WriteTimestamp.Less(timestamp) {
		t.WriteTimestamp = timestamp
	}
	// The restarted epoch reads at the (possibly bumped) write timestamp.
	t.ReadTimestamp = t.WriteTimestamp
	t.DeprecatedOrigTimestamp = t.WriteTimestamp // For 19.2 compatibility.
	// Upgrade priority to the maximum of:
	// - the current transaction priority
	// - a random priority created from userPriority
	// - the conflicting transaction's upgradePriority
	t.UpgradePriority(MakePriority(userPriority))
	t.UpgradePriority(upgradePriority)
	// Reset all epoch-scoped state.
	t.Sequence = 0
	t.WriteTooOld = false
	t.CommitTimestampFixed = false
	t.LockSpans = nil
	t.InFlightWrites = nil
	t.IgnoredSeqNums = nil
}

// BumpEpoch increments the transaction's epoch, allowing for an in-place
// restart. This invalidates all write intents previously written at lower
// epochs.
func (t *Transaction) BumpEpoch() {
	t.Epoch++
}

// Update ratchets priority, timestamp and original timestamp values (among
// others) for the transaction. If t.ID is empty, then the transaction is
// copied from o.
//
// A nil o is a no-op. o must be initialized (see AssertInitialized), and
// updating t with a transaction that has a different ID is fatal.
func (t *Transaction) Update(o *Transaction) {
	if o == nil {
		return
	}
	o.AssertInitialized(context.TODO())
	if t.ID == (uuid.UUID{}) {
		// t is uninitialized: adopt o wholesale.
		*t = *o
		return
	} else if t.ID != o.ID {
		log.Fatalf(context.Background(), "updating txn %s with different txn %s", t.String(), o.String())
		return
	}
	if len(t.Key) == 0 {
		t.Key = o.Key
	}

	// Update epoch-scoped state, depending on the two transactions' epochs.
	if t.Epoch < o.Epoch {
		// Replace all epoch-scoped state.
		t.Epoch = o.Epoch
		t.Status = o.Status
		t.WriteTooOld = o.WriteTooOld
		t.CommitTimestampFixed = o.CommitTimestampFixed
		t.Sequence = o.Sequence
		t.LockSpans = o.LockSpans
		t.InFlightWrites = o.InFlightWrites
		t.IgnoredSeqNums = o.IgnoredSeqNums
	} else if t.Epoch == o.Epoch {
		// Forward all epoch-scoped state.
		switch t.Status {
		case PENDING:
			t.Status = o.Status
		case STAGING:
			// Never regress from STAGING back to PENDING.
			if o.Status != PENDING {
				t.Status = o.Status
			}
		case ABORTED:
			// t stays ABORTED; a COMMITTED o is only logged.
			if o.Status == COMMITTED {
				log.Warningf(context.Background(), "updating ABORTED txn %s with COMMITTED txn %s", t.String(), o.String())
			}
		case COMMITTED:
			// Nothing to do.
		}

		if t.ReadTimestamp.Equal(o.ReadTimestamp) {
			// If neither of the transactions has a bumped ReadTimestamp, then the
			// WriteTooOld flag is cumulative.
			t.WriteTooOld = t.WriteTooOld || o.WriteTooOld
			t.CommitTimestampFixed = t.CommitTimestampFixed || o.CommitTimestampFixed
		} else if t.ReadTimestamp.Less(o.ReadTimestamp) {
			// If `o` has a higher ReadTimestamp (i.e. it's the result of a refresh,
			// which refresh generally clears the WriteTooOld field), then it dictates
			// the WriteTooOld field. This relies on refreshes not being performed
			// concurrently with any requests whose response's WriteTooOld field
			// matters.
			t.WriteTooOld = o.WriteTooOld
			t.CommitTimestampFixed = o.CommitTimestampFixed
		}
		// If t has a higher ReadTimestamp, then it gets to dictate the
		// WriteTooOld field - so there's nothing to update.

		if t.Sequence < o.Sequence {
			t.Sequence = o.Sequence
		}
		// Non-empty collections on o replace (rather than merge with) t's.
		if len(o.LockSpans) > 0 {
			t.LockSpans = o.LockSpans
		}
		if len(o.InFlightWrites) > 0 {
			t.InFlightWrites = o.InFlightWrites
		}
		if len(o.IgnoredSeqNums) > 0 {
			t.IgnoredSeqNums = o.IgnoredSeqNums
		}
	} else /* t.Epoch > o.Epoch */ {
		// Ignore epoch-specific state from previous epoch. However, ensure that
		// the transaction status still makes sense.
		switch o.Status {
		case ABORTED:
			// Once aborted, always aborted. The transaction coordinator might
			// have incremented the txn's epoch without realizing that it was
			// aborted.
			t.Status = ABORTED
		case COMMITTED:
			log.Warningf(context.Background(), "updating txn %s with COMMITTED txn at earlier epoch %s", t.String(), o.String())
		}
	}

	// Forward each of the transaction timestamps.
	t.WriteTimestamp.Forward(o.WriteTimestamp)
	t.LastHeartbeat.Forward(o.LastHeartbeat)
	t.DeprecatedOrigTimestamp.Forward(o.DeprecatedOrigTimestamp)
	t.MaxTimestamp.Forward(o.MaxTimestamp)
	t.ReadTimestamp.Forward(o.ReadTimestamp)

	// On update, set lower bound timestamps to the minimum seen by either txn.
	// These shouldn't differ unless one of them is empty, but we're careful
	// anyway.
	if t.MinTimestamp == (hlc.Timestamp{}) {
		t.MinTimestamp = o.MinTimestamp
	} else if o.MinTimestamp != (hlc.Timestamp{}) {
		t.MinTimestamp.Backward(o.MinTimestamp)
	}

	// Absorb the collected clock uncertainty information.
	for _, v := range o.ObservedTimestamps {
		t.UpdateObservedTimestamp(v.NodeID, v.Timestamp)
	}

	// Ratchet the transaction priority.
	t.UpgradePriority(o.Priority)
}

// UpgradePriority sets transaction priority to the maximum of current
// priority and the specified minPriority. The exception is if the
// current priority is set to the minimum, in which case the minimum
// is preserved.
func (t *Transaction) UpgradePriority(minPriority enginepb.TxnPriority) {
	if minPriority > t.Priority && t.Priority != enginepb.MinTxnPriority {
		t.Priority = minPriority
	}
}

// IsLocking returns whether the transaction has begun acquiring locks.
// This method will never return false for a writing transaction.
//
// The check is simply whether the transaction's Key is non-nil.
func (t *Transaction) IsLocking() bool {
	return t.Key != nil
}

// String formats transaction into human readable string.
//
// The lock span and in-flight write counts are only printed once the
// transaction is no longer PENDING; the ignored seqnum count is printed
// whenever it is non-zero.
//
// NOTE: When updating String(), you probably want to also update SafeMessage().
func (t Transaction) String() string {
	var buf strings.Builder
	if len(t.Name) > 0 {
		fmt.Fprintf(&buf, "%q ", t.Name)
	}
	fmt.Fprintf(&buf, "meta={%s} lock=%t stat=%s rts=%s wto=%t max=%s",
		t.TxnMeta, t.IsLocking(), t.Status, t.ReadTimestamp, t.WriteTooOld, t.MaxTimestamp)
	if ni := len(t.LockSpans); t.Status != PENDING && ni > 0 {
		fmt.Fprintf(&buf, " int=%d", ni)
	}
	if nw := len(t.InFlightWrites); t.Status != PENDING && nw > 0 {
		fmt.Fprintf(&buf, " ifw=%d", nw)
	}
	if ni := len(t.IgnoredSeqNums); ni > 0 {
		fmt.Fprintf(&buf, " isn=%d", ni)
	}
	return buf.String()
}

// SafeMessage implements the SafeMessager interface.
//
// This method should be kept largely synchronized with String(), except that it
// can't include sensitive info (e.g. the transaction key).
1136 func (t Transaction) SafeMessage() string { 1137 var buf strings.Builder 1138 if len(t.Name) > 0 { 1139 fmt.Fprintf(&buf, "%q ", t.Name) 1140 } 1141 fmt.Fprintf(&buf, "meta={%s} lock=%t stat=%s rts=%s wto=%t max=%s", 1142 t.TxnMeta.SafeMessage(), t.IsLocking(), t.Status, t.ReadTimestamp, t.WriteTooOld, t.MaxTimestamp) 1143 if ni := len(t.LockSpans); t.Status != PENDING && ni > 0 { 1144 fmt.Fprintf(&buf, " int=%d", ni) 1145 } 1146 if nw := len(t.InFlightWrites); t.Status != PENDING && nw > 0 { 1147 fmt.Fprintf(&buf, " ifw=%d", nw) 1148 } 1149 if ni := len(t.IgnoredSeqNums); ni > 0 { 1150 fmt.Fprintf(&buf, " isn=%d", ni) 1151 } 1152 return buf.String() 1153 } 1154 1155 // ResetObservedTimestamps clears out all timestamps recorded from individual 1156 // nodes. 1157 func (t *Transaction) ResetObservedTimestamps() { 1158 t.ObservedTimestamps = nil 1159 } 1160 1161 // UpdateObservedTimestamp stores a timestamp off a node's clock for future 1162 // operations in the transaction. When multiple calls are made for a single 1163 // nodeID, the lowest timestamp prevails. 1164 func (t *Transaction) UpdateObservedTimestamp(nodeID NodeID, maxTS hlc.Timestamp) { 1165 // Fast path optimization for either no observed timestamps or 1166 // exactly one, for the same nodeID as we're updating. 1167 if l := len(t.ObservedTimestamps); l == 0 { 1168 t.ObservedTimestamps = []ObservedTimestamp{{NodeID: nodeID, Timestamp: maxTS}} 1169 return 1170 } else if l == 1 && t.ObservedTimestamps[0].NodeID == nodeID { 1171 if maxTS.Less(t.ObservedTimestamps[0].Timestamp) { 1172 t.ObservedTimestamps = []ObservedTimestamp{{NodeID: nodeID, Timestamp: maxTS}} 1173 } 1174 return 1175 } 1176 s := observedTimestampSlice(t.ObservedTimestamps) 1177 t.ObservedTimestamps = s.update(nodeID, maxTS) 1178 } 1179 1180 // GetObservedTimestamp returns the lowest HLC timestamp recorded from the 1181 // given node's clock during the transaction. 
The returned boolean is false if 1182 // no observation about the requested node was found. Otherwise, MaxTimestamp 1183 // can be lowered to the returned timestamp when reading from nodeID. 1184 func (t *Transaction) GetObservedTimestamp(nodeID NodeID) (hlc.Timestamp, bool) { 1185 s := observedTimestampSlice(t.ObservedTimestamps) 1186 return s.get(nodeID) 1187 } 1188 1189 // AddIgnoredSeqNumRange adds the given range to the given list of 1190 // ignored seqnum ranges. Since none of the references held by a Transaction 1191 // allow interior mutations, the existing list is copied instead of being 1192 // mutated in place. 1193 // 1194 // The following invariants are assumed to hold and are preserved: 1195 // - the list contains no overlapping ranges 1196 // - the list contains no contiguous ranges 1197 // - the list is sorted, with larger seqnums at the end 1198 // 1199 // Additionally, the caller must ensure: 1200 // 1201 // 1) if the new range overlaps with some range in the list, then it 1202 // also overlaps with every subsequent range in the list. 1203 // 1204 // 2) the new range's "end" seqnum is larger or equal to the "end" 1205 // seqnum of the last element in the list. 1206 // 1207 // For example: 1208 // current list [3 5] [10 20] [22 24] 1209 // new item: [8 26] 1210 // final list: [3 5] [8 26] 1211 // 1212 // current list [3 5] [10 20] [22 24] 1213 // new item: [28 32] 1214 // final list: [3 5] [10 20] [22 24] [28 32] 1215 // 1216 // This corresponds to savepoints semantics: 1217 // 1218 // - Property 1 says that a rollback to an earlier savepoint 1219 // rolls back over all writes following that savepoint. 1220 // - Property 2 comes from that the new range's 'end' seqnum is the 1221 // current write seqnum and thus larger than or equal to every 1222 // previously seen value. 1223 func (t *Transaction) AddIgnoredSeqNumRange(newRange enginepb.IgnoredSeqNumRange) { 1224 // Truncate the list at the last element not included in the new range. 
1225 1226 list := t.IgnoredSeqNums 1227 i := sort.Search(len(list), func(i int) bool { 1228 return list[i].End >= newRange.Start 1229 }) 1230 1231 cpy := make([]enginepb.IgnoredSeqNumRange, i+1) 1232 copy(cpy[:i], list[:i]) 1233 cpy[i] = newRange 1234 t.IgnoredSeqNums = cpy 1235 } 1236 1237 // AsRecord returns a TransactionRecord object containing only the subset of 1238 // fields from the receiver that must be persisted in the transaction record. 1239 func (t *Transaction) AsRecord() TransactionRecord { 1240 var tr TransactionRecord 1241 tr.TxnMeta = t.TxnMeta 1242 tr.Status = t.Status 1243 tr.LastHeartbeat = t.LastHeartbeat 1244 tr.LockSpans = t.LockSpans 1245 tr.InFlightWrites = t.InFlightWrites 1246 tr.IgnoredSeqNums = t.IgnoredSeqNums 1247 return tr 1248 } 1249 1250 // AsTransaction returns a Transaction object containing populated fields for 1251 // state in the transaction record and empty fields for state omitted from the 1252 // transaction record. 1253 func (tr *TransactionRecord) AsTransaction() Transaction { 1254 var t Transaction 1255 t.TxnMeta = tr.TxnMeta 1256 t.Status = tr.Status 1257 t.LastHeartbeat = tr.LastHeartbeat 1258 t.LockSpans = tr.LockSpans 1259 t.InFlightWrites = tr.InFlightWrites 1260 t.IgnoredSeqNums = tr.IgnoredSeqNums 1261 return t 1262 } 1263 1264 // PrepareTransactionForRetry returns a new Transaction to be used for retrying 1265 // the original Transaction. Depending on the error, this might return an 1266 // already-existing Transaction with an incremented epoch, or a completely new 1267 // Transaction. 1268 // 1269 // The caller should generally check that the error was 1270 // meant for this Transaction before calling this. 1271 // 1272 // pri is the priority that should be used when giving the restarted transaction 1273 // the chance to get a higher priority. Not used when the transaction is being 1274 // aborted. 
1275 // 1276 // In case retryErr tells us that a new Transaction needs to be created, 1277 // isolation and name help initialize this new transaction. 1278 func PrepareTransactionForRetry( 1279 ctx context.Context, pErr *Error, pri UserPriority, clock *hlc.Clock, 1280 ) Transaction { 1281 if pErr.TransactionRestart == TransactionRestart_NONE { 1282 log.Fatalf(ctx, "invalid retryable err (%T): %s", pErr.GetDetail(), pErr) 1283 } 1284 1285 if pErr.GetTxn() == nil { 1286 log.Fatalf(ctx, "missing txn for retryable error: %s", pErr) 1287 } 1288 1289 txn := *pErr.GetTxn() 1290 aborted := false 1291 switch tErr := pErr.GetDetail().(type) { 1292 case *TransactionAbortedError: 1293 // The txn coming with a TransactionAbortedError is not supposed to be used 1294 // for the restart. Instead, a brand new transaction is created. 1295 aborted = true 1296 // TODO(andrei): Should we preserve the ObservedTimestamps across the 1297 // restart? 1298 errTxnPri := txn.Priority 1299 // Start the new transaction at the current time from the local clock. 1300 // The local hlc should have been advanced to at least the error's 1301 // timestamp already. 1302 now := clock.Now() 1303 txn = MakeTransaction( 1304 txn.Name, 1305 nil, // baseKey 1306 // We have errTxnPri, but this wants a UserPriority. So we're going to 1307 // overwrite the priority below. 1308 NormalUserPriority, 1309 now, 1310 clock.MaxOffset().Nanoseconds(), 1311 ) 1312 // Use the priority communicated back by the server. 1313 txn.Priority = errTxnPri 1314 case *ReadWithinUncertaintyIntervalError: 1315 txn.WriteTimestamp.Forward( 1316 readWithinUncertaintyIntervalRetryTimestamp(ctx, &txn, tErr, pErr.OriginNode)) 1317 case *TransactionPushError: 1318 // Increase timestamp if applicable, ensuring that we're just ahead of 1319 // the pushee. 1320 txn.WriteTimestamp.Forward(tErr.PusheeTxn.WriteTimestamp) 1321 txn.UpgradePriority(tErr.PusheeTxn.Priority - 1) 1322 case *TransactionRetryError: 1323 // Nothing to do. 
Transaction.Timestamp has already been forwarded to be 1324 // ahead of any timestamp cache entries or newer versions which caused 1325 // the restart. 1326 case *WriteTooOldError: 1327 // Increase the timestamp to the ts at which we've actually written. 1328 txn.WriteTimestamp.Forward(writeTooOldRetryTimestamp(&txn, tErr)) 1329 default: 1330 log.Fatalf(ctx, "invalid retryable err (%T): %s", pErr.GetDetail(), pErr) 1331 } 1332 if !aborted { 1333 if txn.Status.IsFinalized() { 1334 log.Fatalf(ctx, "transaction unexpectedly finalized in (%T): %s", pErr.GetDetail(), pErr) 1335 } 1336 txn.Restart(pri, txn.Priority, txn.WriteTimestamp) 1337 } 1338 return txn 1339 } 1340 1341 // CanTransactionRetryAtRefreshedTimestamp returns whether the transaction 1342 // specified in the supplied error can be retried at a refreshed timestamp to 1343 // avoid a client-side transaction restart. If true, returns a cloned, updated 1344 // Transaction object with the provisional commit timestamp and refreshed 1345 // timestamp set appropriately. 1346 func CanTransactionRetryAtRefreshedTimestamp( 1347 ctx context.Context, pErr *Error, 1348 ) (bool, *Transaction) { 1349 txn := pErr.GetTxn() 1350 if txn == nil || txn.CommitTimestampFixed { 1351 return false, nil 1352 } 1353 timestamp := txn.WriteTimestamp 1354 switch err := pErr.GetDetail().(type) { 1355 case *TransactionRetryError: 1356 if err.Reason != RETRY_SERIALIZABLE && err.Reason != RETRY_WRITE_TOO_OLD { 1357 return false, nil 1358 } 1359 case *WriteTooOldError: 1360 // TODO(andrei): Chances of success for on write-too-old conditions might be 1361 // usually small: if our txn previously read the key that generated this 1362 // error, obviously the refresh will fail. It might be worth trying to 1363 // detect these cases and save the futile attempt; we'd need to have access 1364 // to the key that generated the error. 
1365 timestamp.Forward(writeTooOldRetryTimestamp(txn, err)) 1366 case *ReadWithinUncertaintyIntervalError: 1367 timestamp.Forward( 1368 readWithinUncertaintyIntervalRetryTimestamp(ctx, txn, err, pErr.OriginNode)) 1369 default: 1370 return false, nil 1371 } 1372 1373 newTxn := txn.Clone() 1374 newTxn.WriteTimestamp.Forward(timestamp) 1375 newTxn.ReadTimestamp.Forward(newTxn.WriteTimestamp) 1376 newTxn.WriteTooOld = false 1377 1378 return true, newTxn 1379 } 1380 1381 func readWithinUncertaintyIntervalRetryTimestamp( 1382 ctx context.Context, txn *Transaction, err *ReadWithinUncertaintyIntervalError, origin NodeID, 1383 ) hlc.Timestamp { 1384 // If the reader encountered a newer write within the uncertainty 1385 // interval, we advance the txn's timestamp just past the last observed 1386 // timestamp from the node. 1387 ts, ok := txn.GetObservedTimestamp(origin) 1388 if !ok { 1389 log.Fatalf(ctx, 1390 "missing observed timestamp for node %d found on uncertainty restart. "+ 1391 "err: %s. txn: %s. Observed timestamps: %v", 1392 origin, err, txn, txn.ObservedTimestamps) 1393 } 1394 // Also forward by the existing timestamp. 1395 ts.Forward(err.ExistingTimestamp.Next()) 1396 return ts 1397 } 1398 1399 func writeTooOldRetryTimestamp(txn *Transaction, err *WriteTooOldError) hlc.Timestamp { 1400 return err.ActualTimestamp 1401 } 1402 1403 // Replicas returns all of the replicas present in the descriptor after this 1404 // trigger applies. 1405 func (crt ChangeReplicasTrigger) Replicas() []ReplicaDescriptor { 1406 if crt.Desc != nil { 1407 return crt.Desc.Replicas().All() 1408 } 1409 return crt.DeprecatedUpdatedReplicas 1410 } 1411 1412 // NextReplicaID returns the next replica id to use after this trigger applies. 1413 func (crt ChangeReplicasTrigger) NextReplicaID() ReplicaID { 1414 if crt.Desc != nil { 1415 return crt.Desc.NextReplicaID 1416 } 1417 return crt.DeprecatedNextReplicaID 1418 } 1419 1420 // ConfChange returns the configuration change described by the trigger. 
func (crt ChangeReplicasTrigger) ConfChange(encodedCtx []byte) (raftpb.ConfChangeI, error) {
	return confChangeImpl(crt, encodedCtx)
}

// alwaysV2 reports whether confChangeImpl should emit a ConfChangeV2 even
// for changes that do not enter a joint configuration. It is currently
// hard-coded to false.
func (crt ChangeReplicasTrigger) alwaysV2() bool {
	// NB: we can return true in 20.1, but we don't win anything unless
	// we are actively trying to migrate out of V1 membership changes, which
	// could modestly simplify small areas of our codebase.
	return false
}

// confChangeImpl is the implementation of (ChangeReplicasTrigger).ConfChange
// narrowed down to the inputs it actually needs for better testability.
//
// It validates the added and removed replicas against the resulting replica
// set and returns either a raftpb.ConfChangeV2 (when entering or leaving a
// joint configuration, or when alwaysV2 is set) or a legacy
// raftpb.ConfChange carrying exactly one change. encodedCtx is passed through
// as the Context of the returned change.
func confChangeImpl(
	crt interface {
		Added() []ReplicaDescriptor
		Removed() []ReplicaDescriptor
		Replicas() []ReplicaDescriptor
		alwaysV2() bool
	},
	encodedCtx []byte,
) (raftpb.ConfChangeI, error) {
	added, removed, replicas := crt.Added(), crt.Removed(), crt.Replicas()

	// sl accumulates the individual raft changes, removals first.
	var sl []raftpb.ConfChangeSingle

	// checkExists verifies that `in` is present in the resulting replica set
	// (matched by ReplicaID) with the same replica type.
	checkExists := func(in ReplicaDescriptor) error {
		for _, rDesc := range replicas {
			if rDesc.ReplicaID == in.ReplicaID {
				if a, b := in.GetType(), rDesc.GetType(); a != b {
					return errors.Errorf("have %s, but descriptor has %s", in, rDesc)
				}
				return nil
			}
		}
		return errors.Errorf("%s missing from descriptors %v", in, replicas)
	}
	// checkNotExists verifies that no replica with `in`'s ReplicaID remains in
	// the resulting replica set.
	checkNotExists := func(in ReplicaDescriptor) error {
		for _, rDesc := range replicas {
			if rDesc.ReplicaID == in.ReplicaID {
				return errors.Errorf("%s must no longer be present in descriptor", in)
			}
		}
		return nil
	}

	for _, rDesc := range removed {
		sl = append(sl, raftpb.ConfChangeSingle{
			Type:   raftpb.ConfChangeRemoveNode,
			NodeID: uint64(rDesc.ReplicaID),
		})

		switch rDesc.GetType() {
		case VOTER_OUTGOING:
			// If a voter is removed through joint consensus, it will
			// be turned into an outgoing voter first.
			if err := checkExists(rDesc); err != nil {
				return nil, err
			}
		case VOTER_DEMOTING:
			// If a voter is demoted through joint consensus, it will
			// be turned into a demoting voter first.
			if err := checkExists(rDesc); err != nil {
				return nil, err
			}
			// It's being re-added as a learner, not only removed.
			sl = append(sl, raftpb.ConfChangeSingle{
				Type:   raftpb.ConfChangeAddLearnerNode,
				NodeID: uint64(rDesc.ReplicaID),
			})
		case LEARNER:
			// A learner could in theory show up in the descriptor if the
			// removal was really a demotion and no joint consensus is used.
			// But etcd/raft currently forces us to go through joint consensus
			// when demoting, so demotions will always have a VOTER_DEMOTING
			// instead. We must be straight-up removing a voter or learner, so
			// the target should be gone from the descriptor at this point.
			if err := checkNotExists(rDesc); err != nil {
				return nil, err
			}
		case VOTER_FULL:
			// A voter can't be in the descriptor if it's being removed.
			if err := checkNotExists(rDesc); err != nil {
				return nil, err
			}
		default:
			return nil, errors.Errorf("can't remove replica in state %v", rDesc.GetType())
		}
	}

	for _, rDesc := range added {
		// The incoming descriptor must also be present in the set of all
		// replicas, which is ultimately the authoritative one because that's
		// what's written to the KV store.
		if err := checkExists(rDesc); err != nil {
			return nil, err
		}

		var changeType raftpb.ConfChangeType
		switch rDesc.GetType() {
		case VOTER_FULL:
			// We're adding a new voter.
			changeType = raftpb.ConfChangeAddNode
		case VOTER_INCOMING:
			// We're adding a voter, but will transition into a joint config
			// first.
			changeType = raftpb.ConfChangeAddNode
		case LEARNER:
			// We're adding a learner.
			// Note that we're guaranteed by virtue of the upstream
			// ChangeReplicas txn that this learner is not currently a voter.
			// Demotions (i.e. transitioning from voter to learner) are not
			// represented in `added`; they're handled in `removed` above.
			changeType = raftpb.ConfChangeAddLearnerNode
		default:
			// A voter that is demoting was just removed and re-added in the
			// `removals` handler. We should not see it again here.
			// A voter that's outgoing similarly has no reason to show up here.
			return nil, errors.Errorf("can't add replica in state %v", rDesc.GetType())
		}
		sl = append(sl, raftpb.ConfChangeSingle{
			Type:   changeType,
			NodeID: uint64(rDesc.ReplicaID),
		})
	}

	// Check whether we're entering a joint state. This is the case precisely when
	// the resulting descriptors tells us that this is the case. Note that we've
	// made sure above that all of the additions/removals are in tune with that
	// descriptor already.
	var enteringJoint bool
	for _, rDesc := range replicas {
		switch rDesc.GetType() {
		case VOTER_INCOMING, VOTER_OUTGOING, VOTER_DEMOTING:
			enteringJoint = true
		default:
		}
	}
	// A trigger with no additions and no removals asks to leave a joint
	// configuration.
	wantLeaveJoint := len(added)+len(removed) == 0
	if !enteringJoint {
		if len(added)+len(removed) > 1 {
			return nil, errors.Errorf("change requires joint consensus")
		}
	} else if wantLeaveJoint {
		return nil, errors.Errorf("descriptor enters joint state, but trigger is requesting to leave one")
	}

	var cc raftpb.ConfChangeI

	if enteringJoint || crt.alwaysV2() {
		// V2 membership changes, which allow atomic replication changes. We
		// track the joint state in the range descriptor and thus we need to be
		// in charge of when to leave the joint state.
		transition := raftpb.ConfChangeTransitionJointExplicit
		if !enteringJoint {
			// If we're using V2 just to avoid V1 (and not because we actually
			// have a change that requires V2), then use an auto transition
			// which skips the joint state. This is necessary: our descriptor
			// says we're not supposed to go through one.
			transition = raftpb.ConfChangeTransitionAuto
		}
		cc = raftpb.ConfChangeV2{
			Transition: transition,
			Changes:    sl,
			Context:    encodedCtx,
		}
	} else if wantLeaveJoint {
		// Transitioning out of a joint config.
		cc = raftpb.ConfChangeV2{
			Context: encodedCtx,
		}
	} else {
		// Legacy path with exactly one change.
		cc = raftpb.ConfChange{
			Type:    sl[0].Type,
			NodeID:  sl[0].NodeID,
			Context: encodedCtx,
		}
	}
	return cc, nil
}

var _ fmt.Stringer = &ChangeReplicasTrigger{}

// String renders the trigger as the kind of configuration change
// (LEAVE_JOINT, ENTER_JOINT or SIMPLE), followed by the added and removed
// replicas, the resulting replica set and the next replica ID. If the
// configuration change cannot be computed, the error is embedded in the
// output instead.
func (crt ChangeReplicasTrigger) String() string {
	var nextReplicaID ReplicaID
	var afterReplicas []ReplicaDescriptor
	added, removed := crt.Added(), crt.Removed()
	if crt.Desc != nil {
		nextReplicaID = crt.Desc.NextReplicaID
		// NB: we don't want to mutate InternalReplicas, so we don't call
		// .Replicas()
		//
		// TODO(tbg): revisit after #39489 is merged.
		afterReplicas = crt.Desc.InternalReplicas
	} else {
		nextReplicaID = crt.DeprecatedNextReplicaID
		afterReplicas = crt.DeprecatedUpdatedReplicas
	}
	var chgS strings.Builder
	cc, err := crt.ConfChange(nil)
	if err != nil {
		fmt.Fprintf(&chgS, "<malformed ChangeReplicasTrigger: %s>", err)
	} else {
		ccv2 := cc.AsV2()
		if ccv2.LeaveJoint() {
			// NB: this isn't missing a trailing space.
			//
			// TODO(tbg): could list the replicas that will actually leave the
			// voter set.
			fmt.Fprintf(&chgS, "LEAVE_JOINT")
		} else if _, ok := ccv2.EnterJoint(); ok {
			fmt.Fprintf(&chgS, "ENTER_JOINT(%s) ", raftpb.ConfChangesToString(ccv2.Changes))
		} else {
			fmt.Fprintf(&chgS, "SIMPLE(%s) ", raftpb.ConfChangesToString(ccv2.Changes))
		}
	}
	if len(added) > 0 {
		fmt.Fprintf(&chgS, "%s%s", ADD_REPLICA, added)
	}
	if len(removed) > 0 {
		if len(added) > 0 {
			chgS.WriteString(", ")
		}
		fmt.Fprintf(&chgS, "%s%s", REMOVE_REPLICA, removed)
	}
	fmt.Fprintf(&chgS, ": after=%s next=%d", afterReplicas, nextReplicaID)
	return chgS.String()
}

// legacy returns the deprecated single-replica field and true when the
// trigger carries no InternalAddedReplicas or InternalRemovedReplicas and its
// DeprecatedReplica has a non-zero ReplicaID. Otherwise it returns a zero
// descriptor and false.
func (crt ChangeReplicasTrigger) legacy() (ReplicaDescriptor, bool) {
	if len(crt.InternalAddedReplicas)+len(crt.InternalRemovedReplicas) == 0 && crt.DeprecatedReplica.ReplicaID != 0 {
		return crt.DeprecatedReplica, true
	}
	return ReplicaDescriptor{}, false
}

// Added returns the replicas added by this change (if there are any).
func (crt ChangeReplicasTrigger) Added() []ReplicaDescriptor {
	if rDesc, ok := crt.legacy(); ok && crt.DeprecatedChangeType == ADD_REPLICA {
		return []ReplicaDescriptor{rDesc}
	}
	return crt.InternalAddedReplicas
}

// Removed returns the replicas whose removal is initiated by this change (if there are any).
// Note that in an atomic replication change, Removed() contains the replicas when they are
// transitioning to VOTER_{OUTGOING,DEMOTING} (from VOTER_FULL). The subsequent trigger
// leaving the joint configuration has an empty Removed().
func (crt ChangeReplicasTrigger) Removed() []ReplicaDescriptor {
	if rDesc, ok := crt.legacy(); ok && crt.DeprecatedChangeType == REMOVE_REPLICA {
		return []ReplicaDescriptor{rDesc}
	}
	return crt.InternalRemovedReplicas
}

// LeaseSequence is a custom type for a lease sequence number.
1678 type LeaseSequence int64 1679 1680 // String implements the fmt.Stringer interface. 1681 func (s LeaseSequence) String() string { 1682 return strconv.FormatInt(int64(s), 10) 1683 } 1684 1685 var _ fmt.Stringer = &Lease{} 1686 1687 func (l Lease) String() string { 1688 var proposedSuffix string 1689 if l.ProposedTS != nil { 1690 proposedSuffix = fmt.Sprintf(" pro=%s", l.ProposedTS) 1691 } 1692 if l.Type() == LeaseExpiration { 1693 return fmt.Sprintf("repl=%s seq=%s start=%s exp=%s%s", l.Replica, l.Sequence, l.Start, l.Expiration, proposedSuffix) 1694 } 1695 return fmt.Sprintf("repl=%s seq=%s start=%s epo=%d%s", l.Replica, l.Sequence, l.Start, l.Epoch, proposedSuffix) 1696 } 1697 1698 // BootstrapLease returns the lease to persist for the range of a freshly bootstrapped store. The 1699 // returned lease is morally "empty" but has a few fields set to non-nil zero values because some 1700 // used to be non-nullable and we now fuzz their nullability in tests. As a consequence, it's better 1701 // to always use zero fields here so that the initial stats are constant. 1702 func BootstrapLease() Lease { 1703 return Lease{ 1704 Expiration: &hlc.Timestamp{}, 1705 DeprecatedStartStasis: &hlc.Timestamp{}, 1706 } 1707 } 1708 1709 // OwnedBy returns whether the given store is the lease owner. 1710 func (l Lease) OwnedBy(storeID StoreID) bool { 1711 return l.Replica.StoreID == storeID 1712 } 1713 1714 // LeaseType describes the type of lease. 1715 type LeaseType int 1716 1717 const ( 1718 // LeaseNone specifies no lease, to be used as a default value. 1719 LeaseNone LeaseType = iota 1720 // LeaseExpiration allows range operations while the wall clock is 1721 // within the expiration timestamp. 1722 LeaseExpiration 1723 // LeaseEpoch allows range operations while the node liveness epoch 1724 // is equal to the lease epoch. 1725 LeaseEpoch 1726 ) 1727 1728 // Type returns the lease type. 
func (l Lease) Type() LeaseType {
	// A zero epoch marks an expiration-based lease; LeaseNone is never
	// returned here.
	if l.Epoch == 0 {
		return LeaseExpiration
	}
	return LeaseEpoch
}

// Equivalent determines whether ol is considered the same lease
// for the purposes of matching leases when executing a command.
// For expiration-based leases, extensions are allowed.
// Ignore proposed timestamps for lease verification; for epoch-
// based leases, the start time of the lease is sufficient to
// avoid using an older lease with same epoch.
//
// NB: Lease.Equivalent is NOT symmetric. For expiration-based
// leases, a lease is equivalent to another with an equal or
// later expiration, but not an earlier expiration.
//
// Both l and newL are passed by value, so the field resets below only affect
// local copies.
func (l Lease) Equivalent(newL Lease) bool {
	// Ignore proposed timestamp & deprecated start stasis.
	l.ProposedTS, newL.ProposedTS = nil, nil
	l.DeprecatedStartStasis, newL.DeprecatedStartStasis = nil, nil
	// Ignore sequence numbers, they are simply a reflection of
	// the equivalency of other fields.
	l.Sequence, newL.Sequence = 0, 0
	// Ignore the ReplicaDescriptor's type. This shouldn't affect lease
	// equivalency because Raft state shouldn't be factored into the state of a
	// Replica's lease. We don't expect a leaseholder to ever become a LEARNER
	// replica, but that also shouldn't prevent it from extending its lease. The
	// code also avoids a potential bug where an unset ReplicaType and a set
	// VOTER ReplicaType are considered distinct and non-equivalent.
	//
	// Change this line to the following when ReplicaType becomes non-nullable:
	//  l.Replica.Type, newL.Replica.Type = 0, 0
	l.Replica.Type, newL.Replica.Type = nil, nil
	// Depending on the type of l, neutralize the fields that must not take
	// part in the final comparison. For epoch-based leases the epochs are
	// compared first and zeroed out only when they match.
	switch l.Type() {
	case LeaseEpoch:
		// Ignore expirations. This seems benign but since we changed the
		// nullability of this field in the 1.2 cycle, it's crucial and
		// tested in TestLeaseEquivalence.
		l.Expiration, newL.Expiration = nil, nil

		if l.Epoch == newL.Epoch {
			l.Epoch, newL.Epoch = 0, 0
		}
	case LeaseExpiration:
		// See the comment above, though this field's nullability wasn't
		// changed. We nil it out for completeness only.
		l.Epoch, newL.Epoch = 0, 0

		// For expiration-based leases, extensions are considered equivalent.
		// This is the one case where Equivalent is not commutative and, as
		// such, requires special handling beneath Raft (see checkForcedErrLocked).
		if l.GetExpiration().LessEq(newL.GetExpiration()) {
			l.Expiration, newL.Expiration = nil, nil
		}
	}
	// Whatever was not neutralized above must match exactly. Pointer fields
	// that were left in place are compared by address here.
	return l == newL
}

// GetExpiration returns the lease expiration or the zero timestamp if the
// receiver is not an expiration-based lease.
func (l Lease) GetExpiration() hlc.Timestamp {
	if l.Expiration == nil {
		return hlc.Timestamp{}
	}
	return *l.Expiration
}

// equivalentTimestamps compares two timestamps for equality and also considers
// the nil timestamp equal to the zero timestamp.
func equivalentTimestamps(a, b *hlc.Timestamp) bool {
	if a == nil {
		if b == nil {
			return true
		}
		if (*b == hlc.Timestamp{}) {
			return true
		}
	} else if b == nil {
		if (*a == hlc.Timestamp{}) {
			return true
		}
	}
	// NOTE(review): this can be reached with exactly one of a, b nil (and the
	// other non-zero); presumably (*hlc.Timestamp).Equal tolerates nil
	// receivers and arguments — confirm.
	return a.Equal(b)
}

// Equal implements the gogoproto Equal interface. This implementation is
// forked from the gogoproto generated code to allow l.Expiration == nil and
// l.Expiration == &hlc.Timestamp{} to compare equal. Ditto for
// DeprecatedStartStasis.
1821 func (l *Lease) Equal(that interface{}) bool { 1822 if that == nil { 1823 return l == nil 1824 } 1825 1826 that1, ok := that.(*Lease) 1827 if !ok { 1828 that2, ok := that.(Lease) 1829 if ok { 1830 that1 = &that2 1831 } else { 1832 return false 1833 } 1834 } 1835 if that1 == nil { 1836 return l == nil 1837 } else if l == nil { 1838 return false 1839 } 1840 1841 if !l.Start.Equal(&that1.Start) { 1842 return false 1843 } 1844 if !equivalentTimestamps(l.Expiration, that1.Expiration) { 1845 return false 1846 } 1847 if !l.Replica.Equal(&that1.Replica) { 1848 return false 1849 } 1850 if !equivalentTimestamps(l.DeprecatedStartStasis, that1.DeprecatedStartStasis) { 1851 return false 1852 } 1853 if !l.ProposedTS.Equal(that1.ProposedTS) { 1854 return false 1855 } 1856 if l.Epoch != that1.Epoch { 1857 return false 1858 } 1859 if l.Sequence != that1.Sequence { 1860 return false 1861 } 1862 return true 1863 } 1864 1865 // MakeIntent makes an intent with the given txn and key. 1866 // This is suitable for use when constructing WriteIntentError. 1867 func MakeIntent(txn *enginepb.TxnMeta, key Key) Intent { 1868 var i Intent 1869 i.Key = key 1870 i.Txn = *txn 1871 return i 1872 } 1873 1874 // AsIntents takes a transaction and a slice of keys and 1875 // returns it as a slice of intents. 1876 func AsIntents(txn *enginepb.TxnMeta, keys []Key) []Intent { 1877 ret := make([]Intent, len(keys)) 1878 for i := range keys { 1879 ret[i] = MakeIntent(txn, keys[i]) 1880 } 1881 return ret 1882 } 1883 1884 // MakeLockAcquisition makes a lock acquisition message from the given 1885 // txn, key, and durability level. 1886 func MakeLockAcquisition(txn *Transaction, key Key, dur lock.Durability) LockAcquisition { 1887 return LockAcquisition{Span: Span{Key: key}, Txn: txn.TxnMeta, Durability: dur} 1888 } 1889 1890 // MakeLockUpdate makes a lock update from the given txn and span. 
1891 func MakeLockUpdate(txn *Transaction, span Span) LockUpdate { 1892 u := LockUpdate{Span: span} 1893 u.SetTxn(txn) 1894 return u 1895 } 1896 1897 // AsLockUpdates takes a slice of spans and returns it as a slice of 1898 // lock updates. 1899 func AsLockUpdates(txn *Transaction, spans []Span) []LockUpdate { 1900 ret := make([]LockUpdate, len(spans)) 1901 for i := range spans { 1902 ret[i] = MakeLockUpdate(txn, spans[i]) 1903 } 1904 return ret 1905 } 1906 1907 // SetTxn updates the transaction details in the lock update. 1908 func (u *LockUpdate) SetTxn(txn *Transaction) { 1909 u.Txn = txn.TxnMeta 1910 u.Status = txn.Status 1911 u.IgnoredSeqNums = txn.IgnoredSeqNums 1912 } 1913 1914 // EqualValue compares for equality. 1915 func (s Span) EqualValue(o Span) bool { 1916 return s.Key.Equal(o.Key) && s.EndKey.Equal(o.EndKey) 1917 } 1918 1919 // Overlaps returns true WLOG for span A and B iff: 1920 // 1. Both spans contain one key (just the start key) and they are equal; or 1921 // 2. The span with only one key is contained inside the other span; or 1922 // 3. The end key of span A is strictly greater than the start key of span B 1923 // and the end key of span B is strictly greater than the start key of span 1924 // A. 1925 func (s Span) Overlaps(o Span) bool { 1926 if !s.Valid() || !o.Valid() { 1927 return false 1928 } 1929 1930 if len(s.EndKey) == 0 && len(o.EndKey) == 0 { 1931 return s.Key.Equal(o.Key) 1932 } else if len(s.EndKey) == 0 { 1933 return bytes.Compare(s.Key, o.Key) >= 0 && bytes.Compare(s.Key, o.EndKey) < 0 1934 } else if len(o.EndKey) == 0 { 1935 return bytes.Compare(o.Key, s.Key) >= 0 && bytes.Compare(o.Key, s.EndKey) < 0 1936 } 1937 return bytes.Compare(s.EndKey, o.Key) > 0 && bytes.Compare(s.Key, o.EndKey) < 0 1938 } 1939 1940 // Combine creates a new span containing the full union of the key 1941 // space covered by the two spans. This includes any key space not 1942 // covered by either span, but between them if the spans are disjoint. 
1943 // Warning: using this method to combine local and non-local spans is 1944 // not recommended and will result in potentially database-wide 1945 // spans being returned. Use with caution. 1946 func (s Span) Combine(o Span) Span { 1947 if !s.Valid() || !o.Valid() { 1948 return Span{} 1949 } 1950 1951 min := s.Key 1952 max := s.Key 1953 if len(s.EndKey) > 0 { 1954 max = s.EndKey 1955 } 1956 if o.Key.Compare(min) < 0 { 1957 min = o.Key 1958 } else if o.Key.Compare(max) > 0 { 1959 max = o.Key 1960 } 1961 if len(o.EndKey) > 0 && o.EndKey.Compare(max) > 0 { 1962 max = o.EndKey 1963 } 1964 if min.Equal(max) { 1965 return Span{Key: min} 1966 } else if s.Key.Equal(max) || o.Key.Equal(max) { 1967 return Span{Key: min, EndKey: max.Next()} 1968 } 1969 return Span{Key: min, EndKey: max} 1970 } 1971 1972 // Contains returns whether the receiver contains the given span. 1973 func (s Span) Contains(o Span) bool { 1974 if !s.Valid() || !o.Valid() { 1975 return false 1976 } 1977 1978 if len(s.EndKey) == 0 && len(o.EndKey) == 0 { 1979 return s.Key.Equal(o.Key) 1980 } else if len(s.EndKey) == 0 { 1981 return false 1982 } else if len(o.EndKey) == 0 { 1983 return bytes.Compare(o.Key, s.Key) >= 0 && bytes.Compare(o.Key, s.EndKey) < 0 1984 } 1985 return bytes.Compare(s.Key, o.Key) <= 0 && bytes.Compare(s.EndKey, o.EndKey) >= 0 1986 } 1987 1988 // ContainsKey returns whether the span contains the given key. 1989 func (s Span) ContainsKey(key Key) bool { 1990 return bytes.Compare(key, s.Key) >= 0 && bytes.Compare(key, s.EndKey) < 0 1991 } 1992 1993 // ProperlyContainsKey returns whether the span properly contains the given key. 1994 func (s Span) ProperlyContainsKey(key Key) bool { 1995 return bytes.Compare(key, s.Key) > 0 && bytes.Compare(key, s.EndKey) < 0 1996 } 1997 1998 // AsRange returns the Span as an interval.Range. 
1999 func (s Span) AsRange() interval.Range { 2000 startKey := s.Key 2001 endKey := s.EndKey 2002 if len(endKey) == 0 { 2003 endKey = s.Key.Next() 2004 startKey = endKey[:len(startKey)] 2005 } 2006 return interval.Range{ 2007 Start: interval.Comparable(startKey), 2008 End: interval.Comparable(endKey), 2009 } 2010 } 2011 2012 func (s Span) String() string { 2013 const maxChars = math.MaxInt32 2014 return PrettyPrintRange(s.Key, s.EndKey, maxChars) 2015 } 2016 2017 // SplitOnKey returns two spans where the left span has EndKey and right span 2018 // has start Key of the split key, respectively. 2019 // If the split key lies outside the span, the original span is returned on the 2020 // left (and right is an invalid span with empty keys). 2021 func (s Span) SplitOnKey(key Key) (left Span, right Span) { 2022 // Cannot split on or before start key or on or after end key. 2023 if bytes.Compare(key, s.Key) <= 0 || bytes.Compare(key, s.EndKey) >= 0 { 2024 return s, Span{} 2025 } 2026 2027 return Span{Key: s.Key, EndKey: key}, Span{Key: key, EndKey: s.EndKey} 2028 } 2029 2030 // Valid returns whether or not the span is a "valid span". 2031 // A valid span cannot have an empty start and end key and must satisfy either: 2032 // 1. The end key is empty. 2033 // 2. The start key is lexicographically-ordered before the end key. 2034 func (s Span) Valid() bool { 2035 // s.Key can be empty if it is KeyMin. 2036 // Can't have both KeyMin start and end keys. 2037 if len(s.Key) == 0 && len(s.EndKey) == 0 { 2038 return false 2039 } 2040 2041 if len(s.EndKey) == 0 { 2042 return true 2043 } 2044 2045 if bytes.Compare(s.Key, s.EndKey) >= 0 { 2046 return false 2047 } 2048 2049 return true 2050 } 2051 2052 // Spans is a slice of spans. 
2053 type Spans []Span 2054 2055 // implement Sort.Interface 2056 func (a Spans) Len() int { return len(a) } 2057 func (a Spans) Swap(i, j int) { a[i], a[j] = a[j], a[i] } 2058 func (a Spans) Less(i, j int) bool { return a[i].Key.Compare(a[j].Key) < 0 } 2059 2060 // ContainsKey returns whether any of the spans in the set of spans contains 2061 // the given key. 2062 func (a Spans) ContainsKey(key Key) bool { 2063 for _, span := range a { 2064 if span.ContainsKey(key) { 2065 return true 2066 } 2067 } 2068 2069 return false 2070 } 2071 2072 // RSpan is a key range with an inclusive start RKey and an exclusive end RKey. 2073 type RSpan struct { 2074 Key, EndKey RKey 2075 } 2076 2077 // Equal compares for equality. 2078 func (rs RSpan) Equal(o RSpan) bool { 2079 return rs.Key.Equal(o.Key) && rs.EndKey.Equal(o.EndKey) 2080 } 2081 2082 // ContainsKey returns whether this span contains the specified key. 2083 func (rs RSpan) ContainsKey(key RKey) bool { 2084 return bytes.Compare(key, rs.Key) >= 0 && bytes.Compare(key, rs.EndKey) < 0 2085 } 2086 2087 // ContainsKeyInverted returns whether this span contains the specified key. The 2088 // receiver span is considered inverted, meaning that instead of containing the 2089 // range ["key","endKey"), it contains the range ("key","endKey"]. 2090 func (rs RSpan) ContainsKeyInverted(key RKey) bool { 2091 return bytes.Compare(key, rs.Key) > 0 && bytes.Compare(key, rs.EndKey) <= 0 2092 } 2093 2094 // ContainsKeyRange returns whether this span contains the specified 2095 // key range from start (inclusive) to end (exclusive). 2096 // If end is empty or start is equal to end, returns ContainsKey(start). 
2097 func (rs RSpan) ContainsKeyRange(start, end RKey) bool { 2098 if len(end) == 0 { 2099 return rs.ContainsKey(start) 2100 } 2101 if comp := bytes.Compare(end, start); comp < 0 { 2102 return false 2103 } else if comp == 0 { 2104 return rs.ContainsKey(start) 2105 } 2106 return bytes.Compare(start, rs.Key) >= 0 && bytes.Compare(rs.EndKey, end) >= 0 2107 } 2108 2109 func (rs RSpan) String() string { 2110 const maxChars = math.MaxInt32 2111 return PrettyPrintRange(Key(rs.Key), Key(rs.EndKey), maxChars) 2112 } 2113 2114 // Intersect returns the intersection of the current span and the 2115 // descriptor's range. Returns an error if the span and the 2116 // descriptor's range do not overlap. 2117 func (rs RSpan) Intersect(desc *RangeDescriptor) (RSpan, error) { 2118 if !rs.Key.Less(desc.EndKey) || !desc.StartKey.Less(rs.EndKey) { 2119 return rs, errors.Errorf("span and descriptor's range do not overlap: %s vs %s", rs, desc) 2120 } 2121 2122 key := rs.Key 2123 if key.Less(desc.StartKey) { 2124 key = desc.StartKey 2125 } 2126 endKey := rs.EndKey 2127 if !desc.ContainsKeyRange(desc.StartKey, endKey) { 2128 endKey = desc.EndKey 2129 } 2130 return RSpan{key, endKey}, nil 2131 } 2132 2133 // AsRawSpanWithNoLocals returns the RSpan as a Span. This is to be used only 2134 // in select situations in which an RSpan is known to not contain a wrapped 2135 // locally-addressed Span. 2136 func (rs RSpan) AsRawSpanWithNoLocals() Span { 2137 return Span{ 2138 Key: Key(rs.Key), 2139 EndKey: Key(rs.EndKey), 2140 } 2141 } 2142 2143 // KeyValueByKey implements sorting of a slice of KeyValues by key. 2144 type KeyValueByKey []KeyValue 2145 2146 // Len implements sort.Interface. 2147 func (kv KeyValueByKey) Len() int { 2148 return len(kv) 2149 } 2150 2151 // Less implements sort.Interface. 2152 func (kv KeyValueByKey) Less(i, j int) bool { 2153 return bytes.Compare(kv[i].Key, kv[j].Key) < 0 2154 } 2155 2156 // Swap implements sort.Interface. 
2157 func (kv KeyValueByKey) Swap(i, j int) { 2158 kv[i], kv[j] = kv[j], kv[i] 2159 } 2160 2161 var _ sort.Interface = KeyValueByKey{} 2162 2163 // observedTimestampSlice maintains an immutable sorted list of observed 2164 // timestamps. 2165 type observedTimestampSlice []ObservedTimestamp 2166 2167 func (s observedTimestampSlice) index(nodeID NodeID) int { 2168 return sort.Search(len(s), 2169 func(i int) bool { 2170 return s[i].NodeID >= nodeID 2171 }, 2172 ) 2173 } 2174 2175 // get the observed timestamp for the specified node, returning false if no 2176 // timestamp exists. 2177 func (s observedTimestampSlice) get(nodeID NodeID) (hlc.Timestamp, bool) { 2178 i := s.index(nodeID) 2179 if i < len(s) && s[i].NodeID == nodeID { 2180 return s[i].Timestamp, true 2181 } 2182 return hlc.Timestamp{}, false 2183 } 2184 2185 // update the timestamp for the specified node, or add a new entry in the 2186 // correct (sorted) location. The receiver is not mutated. 2187 func (s observedTimestampSlice) update( 2188 nodeID NodeID, timestamp hlc.Timestamp, 2189 ) observedTimestampSlice { 2190 i := s.index(nodeID) 2191 if i < len(s) && s[i].NodeID == nodeID { 2192 if timestamp.Less(s[i].Timestamp) { 2193 // The input slice is immutable, so copy and update. 2194 cpy := make(observedTimestampSlice, len(s)) 2195 copy(cpy, s) 2196 cpy[i].Timestamp = timestamp 2197 return cpy 2198 } 2199 return s 2200 } 2201 // The input slice is immutable, so copy and update. Don't append to 2202 // avoid an allocation. Doing so could invalidate a previous update 2203 // to this receiver. 2204 cpy := make(observedTimestampSlice, len(s)+1) 2205 copy(cpy[:i], s[:i]) 2206 cpy[i] = ObservedTimestamp{NodeID: nodeID, Timestamp: timestamp} 2207 copy(cpy[i+1:], s[i:]) 2208 return cpy 2209 } 2210 2211 // SequencedWriteBySeq implements sorting of a slice of SequencedWrites 2212 // by sequence number. 2213 type SequencedWriteBySeq []SequencedWrite 2214 2215 // Len implements sort.Interface. 
2216 func (s SequencedWriteBySeq) Len() int { return len(s) } 2217 2218 // Less implements sort.Interface. 2219 func (s SequencedWriteBySeq) Less(i, j int) bool { return s[i].Sequence < s[j].Sequence } 2220 2221 // Swap implements sort.Interface. 2222 func (s SequencedWriteBySeq) Swap(i, j int) { s[i], s[j] = s[j], s[i] } 2223 2224 var _ sort.Interface = SequencedWriteBySeq{} 2225 2226 // Find searches for the index of the SequencedWrite with the provided 2227 // sequence number. Returns -1 if no corresponding write is found. 2228 func (s SequencedWriteBySeq) Find(seq enginepb.TxnSeq) int { 2229 if util.RaceEnabled { 2230 if !sort.IsSorted(s) { 2231 panic("SequencedWriteBySeq must be sorted") 2232 } 2233 } 2234 if i := sort.Search(len(s), func(i int) bool { 2235 return s[i].Sequence >= seq 2236 }); i < len(s) && s[i].Sequence == seq { 2237 return i 2238 } 2239 return -1 2240 } 2241 2242 // Silence unused warning. 2243 var _ = (SequencedWriteBySeq{}).Find 2244 2245 func init() { 2246 // Inject the format dependency into the enginepb package. 2247 enginepb.FormatBytesAsKey = func(k []byte) string { return Key(k).String() } 2248 enginepb.FormatBytesAsValue = func(v []byte) string { return Value{RawBytes: v}.PrettyPrint() } 2249 }