github.com/cilium/statedb@v0.3.2/txn.go (about) 1 // SPDX-License-Identifier: Apache-2.0 2 // Copyright Authors of Cilium 3 4 package statedb 5 6 import ( 7 "bufio" 8 "encoding/binary" 9 "encoding/json" 10 "fmt" 11 "io" 12 "reflect" 13 "runtime" 14 "slices" 15 "sync/atomic" 16 "time" 17 18 "github.com/cilium/statedb/index" 19 "github.com/cilium/statedb/internal" 20 "github.com/cilium/statedb/part" 21 ) 22 23 type txn struct { 24 db *DB 25 root dbRoot 26 27 handle string 28 acquiredAt time.Time // the time at which the transaction acquired the locks 29 duration atomic.Uint64 // the transaction duration after it finished 30 writeTxn 31 } 32 33 type writeTxn struct { 34 modifiedTables []*tableEntry // table entries being modified 35 smus internal.SortableMutexes // the (sorted) table locks 36 tableNames []string 37 } 38 39 type indexReadTxn struct { 40 part.Ops[object] 41 unique bool 42 } 43 44 type indexTxn struct { 45 *part.Txn[object] 46 unique bool 47 } 48 49 var zeroTxn = txn{} 50 51 // txn fulfills the ReadTxn/WriteTxn interface. 52 func (txn *txn) getTxn() *txn { 53 return txn 54 } 55 56 // acquiredInfo returns the information for the "Last WriteTxn" column 57 // in "db tables" command. The correctness of this relies on the following assumptions: 58 // - txn.handle and txn.acquiredAt are not modified 59 // - txn.duration is atomically updated on Commit or Abort 60 func (txn *txn) acquiredInfo() string { 61 if txn == nil { 62 return "" 63 } 64 since := internal.PrettySince(txn.acquiredAt) 65 dur := time.Duration(txn.duration.Load()) 66 if txn.duration.Load() == 0 { 67 // Still locked 68 return fmt.Sprintf("%s (locked for %s)", txn.handle, since) 69 } 70 return fmt.Sprintf("%s (%s ago, locked for %s)", txn.handle, since, internal.PrettyDuration(dur)) 71 } 72 73 // txnFinalizer is called when the GC frees *txn. It checks that a WriteTxn 74 // has been Aborted or Committed. This is a safeguard against forgetting to 75 // Abort/Commit which would cause the table to be locked forever. 76 func txnFinalizer(txn *txn) { 77 if txn.modifiedTables != nil { 78 panic(fmt.Sprintf("WriteTxn from handle %s against tables %v was never Abort()'d or Commit()'d", txn.handle, txn.tableNames)) 79 } 80 } 81 82 func (txn *txn) getTableEntry(meta TableMeta) *tableEntry { 83 if txn.modifiedTables != nil { 84 entry := txn.modifiedTables[meta.tablePos()] 85 if entry != nil { 86 return entry 87 } 88 } 89 return &txn.root[meta.tablePos()] 90 } 91 92 // indexReadTxn returns a transaction to read from the specific index. 93 // If the table or index is not found this returns nil & error. 94 func (txn *txn) indexReadTxn(meta TableMeta, indexPos int) (indexReadTxn, error) { 95 if meta.tablePos() < 0 { 96 return indexReadTxn{}, tableError(meta.Name(), ErrTableNotRegistered) 97 } 98 if txn.modifiedTables != nil { 99 entry := txn.modifiedTables[meta.tablePos()] 100 if entry != nil { 101 itxn, err := txn.indexWriteTxn(meta, indexPos) 102 if err != nil { 103 return indexReadTxn{}, err 104 } 105 // Since iradix reuses nodes when mutating we need to return a clone 106 // so that iterators don't become invalid. 107 return indexReadTxn{itxn.Txn.Clone(), itxn.unique}, nil 108 } 109 } 110 indexEntry := txn.root[meta.tablePos()].indexes[indexPos] 111 return indexReadTxn{indexEntry.tree, indexEntry.unique}, nil 112 } 113 114 // indexWriteTxn returns a transaction to read/write to a specific index. 115 // The created transaction is memoized and used for subsequent reads and/or writes. 116 func (txn *txn) indexWriteTxn(meta TableMeta, indexPos int) (indexTxn, error) { 117 table := txn.modifiedTables[meta.tablePos()] 118 if table == nil { 119 return indexTxn{}, tableError(meta.Name(), ErrTableNotLockedForWriting) 120 } 121 indexEntry := &table.indexes[indexPos] 122 if indexEntry.txn == nil { 123 indexEntry.txn = indexEntry.tree.Txn() 124 } 125 return indexTxn{indexEntry.txn, indexEntry.unique}, nil 126 } 127 128 // mustIndexReadTxn returns a transaction to read from the specific index. 129 // Panics if table or index are not found. 130 func (txn *txn) mustIndexReadTxn(meta TableMeta, indexPos int) indexReadTxn { 131 indexTxn, err := txn.indexReadTxn(meta, indexPos) 132 if err != nil { 133 panic(err) 134 } 135 return indexTxn 136 } 137 138 // mustIndexReadTxn returns a transaction to read or write from the specific index. 139 // Panics if table or index not found. 140 func (txn *txn) mustIndexWriteTxn(meta TableMeta, indexPos int) indexTxn { 141 indexTxn, err := txn.indexWriteTxn(meta, indexPos) 142 if err != nil { 143 panic(err) 144 } 145 return indexTxn 146 } 147 148 func (txn *txn) insert(meta TableMeta, guardRevision Revision, data any) (object, bool, error) { 149 return txn.modify(meta, guardRevision, data, func(_ any) any { return data }) 150 } 151 152 func (txn *txn) modify(meta TableMeta, guardRevision Revision, newData any, merge func(any) any) (object, bool, error) { 153 if txn.modifiedTables == nil { 154 return object{}, false, ErrTransactionClosed 155 } 156 157 // Look up table and allocate a new revision. 158 tableName := meta.Name() 159 table := txn.modifiedTables[meta.tablePos()] 160 if table == nil { 161 return object{}, false, tableError(tableName, ErrTableNotLockedForWriting) 162 } 163 oldRevision := table.revision 164 table.revision++ 165 revision := table.revision 166 167 // Update the primary index first 168 idKey := meta.primary().fromObject(object{data: newData}).First() 169 idIndexTxn := txn.mustIndexWriteTxn(meta, PrimaryIndexPos) 170 171 var obj object 172 oldObj, oldExists := idIndexTxn.Modify(idKey, 173 func(old object) object { 174 obj = object{ 175 revision: revision, 176 } 177 if old.revision == 0 { 178 // Zero revision: the object did not exist so no need to call merge. 179 obj.data = newData 180 } else { 181 obj.data = merge(old.data) 182 } 183 return obj 184 }) 185 186 // Sanity check: is the same object being inserted back and thus the 187 // immutable object is being mutated? 188 if oldExists { 189 val := reflect.ValueOf(obj.data) 190 if val.Kind() == reflect.Pointer { 191 oldVal := reflect.ValueOf(oldObj.data) 192 if val.UnsafePointer() == oldVal.UnsafePointer() { 193 panic(fmt.Sprintf( 194 "Insert() of the same object (%T) back into the table. Is the immutable object being mutated?", 195 obj.data)) 196 } 197 } 198 } 199 200 // For CompareAndSwap() validate against the given guard revision 201 if guardRevision > 0 { 202 if !oldExists { 203 // CompareAndSwap requires the object to exist. Revert 204 // the insert. 205 idIndexTxn.Delete(idKey) 206 table.revision = oldRevision 207 return object{}, false, ErrObjectNotFound 208 } 209 if oldObj.revision != guardRevision { 210 // Revert the change. We're assuming here that it's rarer for CompareAndSwap() to 211 // fail and thus we're optimizing to have only one lookup in the common case 212 // (versus doing a Get() and then Insert()). 213 idIndexTxn.Insert(idKey, oldObj) 214 table.revision = oldRevision 215 return oldObj, true, ErrRevisionNotEqual 216 } 217 } 218 219 // Update revision index 220 revIndexTxn := txn.mustIndexWriteTxn(meta, RevisionIndexPos) 221 if oldExists { 222 var revKey [8]byte // to avoid heap allocation 223 binary.BigEndian.PutUint64(revKey[:], oldObj.revision) 224 _, ok := revIndexTxn.Delete(revKey[:]) 225 if !ok { 226 panic("BUG: Old revision index entry not found") 227 } 228 } 229 revIndexTxn.Insert(index.Uint64(obj.revision), obj) 230 231 // If it's new, possibly remove an older deleted object with the same 232 // primary key from the graveyard. 233 if !oldExists { 234 if old, existed := txn.mustIndexWriteTxn(meta, GraveyardIndexPos).Delete(idKey); existed { 235 var revKey [8]byte // to avoid heap allocation 236 binary.BigEndian.PutUint64(revKey[:], old.revision) 237 txn.mustIndexWriteTxn(meta, GraveyardRevisionIndexPos).Delete(revKey[:]) 238 } 239 } 240 241 // Then update secondary indexes 242 for _, indexer := range meta.secondary() { 243 indexTxn := txn.mustIndexWriteTxn(meta, indexer.pos) 244 newKeys := indexer.fromObject(obj) 245 246 if oldExists { 247 // If the object already existed it might've invalidated the 248 // non-primary indexes. Compute the old key for this index and 249 // if the new key is different delete the old entry. 250 indexer.fromObject(oldObj).Foreach(func(oldKey index.Key) { 251 if !indexer.unique { 252 oldKey = encodeNonUniqueKey(idKey, oldKey) 253 } 254 if !newKeys.Exists(oldKey) { 255 indexTxn.Delete(oldKey) 256 } 257 }) 258 } 259 newKeys.Foreach(func(newKey index.Key) { 260 // Non-unique secondary indexes are formed by concatenating them 261 // with the primary key. 262 if !indexer.unique { 263 newKey = encodeNonUniqueKey(idKey, newKey) 264 } 265 indexTxn.Insert(newKey, obj) 266 }) 267 } 268 269 return oldObj, oldExists, nil 270 } 271 272 func (txn *txn) hasDeleteTrackers(meta TableMeta) bool { 273 table := txn.modifiedTables[meta.tablePos()] 274 if table != nil { 275 return table.deleteTrackers.Len() > 0 276 } 277 return txn.root[meta.tablePos()].deleteTrackers.Len() > 0 278 } 279 280 func (txn *txn) addDeleteTracker(meta TableMeta, trackerName string, dt anyDeleteTracker) error { 281 if txn.modifiedTables == nil { 282 return ErrTransactionClosed 283 } 284 table := txn.modifiedTables[meta.tablePos()] 285 if table == nil { 286 return tableError(meta.Name(), ErrTableNotLockedForWriting) 287 } 288 289 _, _, table.deleteTrackers = table.deleteTrackers.Insert([]byte(trackerName), dt) 290 txn.db.metrics.DeleteTrackerCount(meta.Name(), table.deleteTrackers.Len()) 291 292 return nil 293 } 294 295 func (txn *txn) delete(meta TableMeta, guardRevision Revision, data any) (object, bool, error) { 296 if txn.modifiedTables == nil { 297 return object{}, false, ErrTransactionClosed 298 } 299 300 // Look up table and allocate a new revision. 301 tableName := meta.Name() 302 table := txn.modifiedTables[meta.tablePos()] 303 if table == nil { 304 return object{}, false, tableError(tableName, ErrTableNotLockedForWriting) 305 } 306 oldRevision := table.revision 307 table.revision++ 308 revision := table.revision 309 310 // Delete from the primary index first to grab the object. 311 // We assume that "data" has only enough defined fields to 312 // compute the primary key. 313 idKey := meta.primary().fromObject(object{data: data}).First() 314 idIndexTree := txn.mustIndexWriteTxn(meta, PrimaryIndexPos) 315 obj, existed := idIndexTree.Delete(idKey) 316 if !existed { 317 return object{}, false, nil 318 } 319 320 // For CompareAndDelete() validate against guard revision and if there's a mismatch, 321 // revert the change. 322 if guardRevision > 0 { 323 if obj.revision != guardRevision { 324 idIndexTree.Insert(idKey, obj) 325 table.revision = oldRevision 326 return obj, true, ErrRevisionNotEqual 327 } 328 } 329 330 // Remove the object from the revision index. 331 indexTree := txn.mustIndexWriteTxn(meta, RevisionIndexPos) 332 var revKey [8]byte // To avoid heap allocation 333 binary.BigEndian.PutUint64(revKey[:], obj.revision) 334 if _, ok := indexTree.Delete(revKey[:]); !ok { 335 txn.Abort() 336 panic("BUG: Object to be deleted not found from revision index") 337 } 338 339 // Then update secondary indexes. 340 for _, indexer := range meta.secondary() { 341 indexer.fromObject(obj).Foreach(func(key index.Key) { 342 if !indexer.unique { 343 key = encodeNonUniqueKey(idKey, key) 344 } 345 txn.mustIndexWriteTxn(meta, indexer.pos).Delete(key) 346 }) 347 } 348 349 // And finally insert the object into the graveyard. 350 if txn.hasDeleteTrackers(meta) { 351 graveyardIndex := txn.mustIndexWriteTxn(meta, GraveyardIndexPos) 352 obj.revision = revision 353 if _, existed := graveyardIndex.Insert(idKey, obj); existed { 354 txn.Abort() 355 panic("BUG: Double deletion! Deleted object already existed in graveyard") 356 } 357 txn.mustIndexWriteTxn(meta, GraveyardRevisionIndexPos).Insert(index.Uint64(revision), obj) 358 } 359 360 return obj, true, nil 361 } 362 363 const ( 364 nonUniqueSeparator = 0x0 365 nonUniqueSubstitute = 0xfe 366 nonUniqueSubstitute2 = 0xfd 367 ) 368 369 // appendEncodePrimary encodes the 'src' (primary key) into 'dst'. 370 func appendEncodePrimary(dst, src []byte) []byte { 371 for _, b := range src { 372 switch b { 373 case nonUniqueSeparator: 374 dst = append(dst, nonUniqueSubstitute) 375 case nonUniqueSubstitute: 376 dst = append(dst, nonUniqueSubstitute2, 0x00) 377 case nonUniqueSubstitute2: 378 dst = append(dst, nonUniqueSubstitute2, 0x01) 379 default: 380 dst = append(dst, b) 381 } 382 } 383 return dst 384 } 385 386 // encodeNonUniqueKey constructs the internal key to use with non-unique indexes. 387 // The key is constructed by concatenating the secondary key with the primary key 388 // along with the secondary key length. The secondary and primary key are separated 389 // with by a 0x0 to ensure ordering is defined by the secondary key. To make sure the 390 // separator does not appear in the primary key it is encoded using this schema: 391 // 392 // 0x0 => 0xfe, 0xfe => 0xfd00, 0xfd => 0xfd01 393 // 394 // The schema tries to avoid expansion for encoded small integers, e.g. 0x0000 becomes 0xfefe. 395 // The length at the end is encoded as unsigned 16-bit big endian. 396 // 397 // This schema allows looking up from the non-unique index with the secondary key by 398 // doing a prefix search. The length is used to safe-guard against indexers that don't 399 // terminate the key properly (e.g. if secondary key is "foo", then we don't want 400 // "foobar" to match). 401 func encodeNonUniqueKey(primary, secondary index.Key) []byte { 402 key := make([]byte, 0, 403 len(secondary)+1 /* separator */ + 404 len(primary)+ 405 2 /* space for few substitutions */ + 406 2 /* length */) 407 key = append(key, secondary...) 408 key = append(key, nonUniqueSeparator) 409 key = appendEncodePrimary(key, primary) 410 // KeySet limits size of key to 16 bits. 411 return binary.BigEndian.AppendUint16(key, uint16(len(secondary))) 412 } 413 414 func decodeNonUniqueKey(key []byte) (secondary []byte, encPrimary []byte) { 415 // Non-unique key is [<secondary...>, '\xfe', <encoded primary...>, <secondary length>] 416 if len(key) < 2 { 417 return nil, nil 418 } 419 secondaryLength := int(binary.BigEndian.Uint16(key[len(key)-2:])) 420 if len(key) < secondaryLength { 421 return nil, nil 422 } 423 return key[:secondaryLength], key[secondaryLength+1 : len(key)-2] 424 } 425 426 func (txn *txn) Abort() { 427 runtime.SetFinalizer(txn, nil) 428 429 // If modifiedTables is nil, this transaction has already been committed or aborted, and 430 // thus there is nothing to do. We allow this without failure to allow for defer 431 // pattern: 432 // 433 // txn := db.WriteTxn(...) 434 // defer txn.Abort() 435 // 436 // ... 437 // if err != nil { 438 // // Transaction now aborted. 439 // return err 440 // } 441 // 442 // txn.Commit() 443 // 444 if txn.modifiedTables == nil { 445 return 446 } 447 448 txn.duration.Store(uint64(time.Since(txn.acquiredAt))) 449 450 txn.smus.Unlock() 451 txn.db.metrics.WriteTxnDuration( 452 txn.handle, 453 txn.tableNames, 454 time.Since(txn.acquiredAt)) 455 456 txn.writeTxn = writeTxn{} 457 } 458 459 // Commit the transaction. Returns a ReadTxn that is the snapshot of the database at the 460 // point of commit. 461 func (txn *txn) Commit() ReadTxn { 462 runtime.SetFinalizer(txn, nil) 463 464 // We operate here under the following properties: 465 // 466 // - Each table that we're modifying has its SortableMutex locked and held by 467 // the caller (via WriteTxn()). Concurrent updates to other tables are 468 // allowed (but not to the root pointer), and thus there may be multiple parallel 469 // Commit()'s in progress, but each of those will only process work for tables 470 // they have locked, until root is to be updated. 471 // 472 // - Modifications to the root pointer (db.root) are made with the db.mu acquired, 473 // and thus changes to it are always performed sequentially. The root pointer is 474 // updated atomically, and thus readers see either an old root or a new root. 475 // Both the old root and new root are immutable after they're made available via 476 // the root pointer. 477 // 478 // - As the root is atomically swapped to a new immutable tree of tables of indexes, 479 // a reader can acquire an immutable snapshot of all data in the database with a 480 // simpler atomic pointer load. 481 482 // If db is nil, this transaction has already been committed or aborted, and 483 // thus there is nothing to do. 484 if txn.db == nil { 485 return nil 486 } 487 488 txn.duration.Store(uint64(time.Since(txn.acquiredAt))) 489 490 db := txn.db 491 492 // Commit each individual changed index to each table. 493 // We don't notify yet (CommitOnly) as the root needs to be updated 494 // first as otherwise readers would wake up too early. 495 txnToNotify := []*part.Txn[object]{} 496 for _, table := range txn.modifiedTables { 497 if table == nil { 498 continue 499 } 500 for i := range table.indexes { 501 txn := table.indexes[i].txn 502 if txn != nil { 503 table.indexes[i].tree = txn.CommitOnly() 504 table.indexes[i].txn = nil 505 txnToNotify = append(txnToNotify, txn) 506 } 507 } 508 509 // Update metrics 510 name := table.meta.Name() 511 db.metrics.GraveyardObjectCount(name, table.numDeletedObjects()) 512 db.metrics.ObjectCount(name, table.numObjects()) 513 db.metrics.Revision(name, table.revision) 514 } 515 516 // Acquire the lock on the root tree to sequence the updates to it. We can acquire 517 // it after we've built up the new table entries above, since changes to those were 518 // protected by each table lock (that we're holding here). 519 db.mu.Lock() 520 521 // Since the root may have changed since the pointer was last read in WriteTxn(), 522 // load it again and modify the latest version that we now have immobilised by 523 // the root lock. 524 root := *db.root.Load() 525 root = slices.Clone(root) 526 527 var initChansToClose []chan struct{} 528 529 // Insert the modified tables into the root tree of tables. 530 for pos, table := range txn.modifiedTables { 531 if table != nil { 532 // Check if tables become initialized. We close the channel only after 533 // we've swapped in the new root so that one cannot get a snapshot of 534 // an uninitialized table after observing the channel closing. 535 if !table.initialized && len(table.pendingInitializers) == 0 { 536 initChansToClose = append(initChansToClose, table.initWatchChan) 537 table.initialized = true 538 } 539 root[pos] = *table 540 } 541 } 542 543 // Commit the transaction to build the new root tree and then 544 // atomically store it. 545 txn.root = root 546 db.root.Store(&root) 547 db.mu.Unlock() 548 549 // With the root pointer updated, we can now release the tables for the next write transaction. 550 txn.smus.Unlock() 551 552 // Now that new root is committed, we can notify readers by closing the watch channels of 553 // mutated radix tree nodes in all changed indexes and on the root itself. 554 for _, txn := range txnToNotify { 555 txn.Notify() 556 } 557 558 // Notify table initializations 559 for _, ch := range initChansToClose { 560 close(ch) 561 } 562 563 txn.db.metrics.WriteTxnDuration( 564 txn.handle, 565 txn.tableNames, 566 time.Since(txn.acquiredAt)) 567 568 // Convert into a ReadTxn 569 txn.writeTxn = writeTxn{} 570 return txn 571 } 572 573 func writeTableAsJSON(buf *bufio.Writer, txn *txn, table *tableEntry) error { 574 indexTxn := txn.mustIndexReadTxn(table.meta, PrimaryIndexPos) 575 iter := indexTxn.Iterator() 576 577 buf.WriteString(" \"" + table.meta.Name() + "\": [\n") 578 579 _, obj, ok := iter.Next() 580 for ok { 581 buf.WriteString(" ") 582 bs, err := json.Marshal(obj.data) 583 if err != nil { 584 return err 585 } 586 buf.Write(bs) 587 _, obj, ok = iter.Next() 588 if ok { 589 buf.WriteString(",\n") 590 } else { 591 buf.WriteByte('\n') 592 } 593 } 594 buf.WriteString(" ]") 595 return nil 596 } 597 598 // WriteJSON marshals out the database as JSON into the given writer. 599 // If tables are given then only these tables are written. 600 func (txn *txn) WriteJSON(w io.Writer, tables ...string) error { 601 buf := bufio.NewWriter(w) 602 buf.WriteString("{\n") 603 first := true 604 605 for _, table := range txn.root { 606 if len(tables) > 0 && !slices.Contains(tables, table.meta.Name()) { 607 continue 608 } 609 610 if !first { 611 buf.WriteString(",\n") 612 } else { 613 first = false 614 } 615 616 err := writeTableAsJSON(buf, txn, &table) 617 if err != nil { 618 return err 619 } 620 } 621 buf.WriteString("\n}\n") 622 return buf.Flush() 623 }