go.etcd.io/etcd@v3.3.27+incompatible/mvcc/backend/backend.go (about) 1 // Copyright 2015 The etcd Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package backend 16 17 import ( 18 "fmt" 19 "hash/crc32" 20 "io" 21 "io/ioutil" 22 "os" 23 "path/filepath" 24 "sync" 25 "sync/atomic" 26 "time" 27 28 bolt "github.com/coreos/bbolt" 29 "github.com/coreos/pkg/capnslog" 30 ) 31 32 var ( 33 defaultBatchLimit = 10000 34 defaultBatchInterval = 100 * time.Millisecond 35 36 defragLimit = 10000 37 38 // initialMmapSize is the initial size of the mmapped region. Setting this larger than 39 // the potential max db size can prevent writer from blocking reader. 40 // This only works for linux. 41 initialMmapSize = uint64(10 * 1024 * 1024 * 1024) 42 43 plog = capnslog.NewPackageLogger("github.com/coreos/etcd", "mvcc/backend") 44 45 // minSnapshotWarningTimeout is the minimum threshold to trigger a long running snapshot warning. 46 minSnapshotWarningTimeout = time.Duration(30 * time.Second) 47 ) 48 49 type Backend interface { 50 ReadTx() ReadTx 51 BatchTx() BatchTx 52 53 Snapshot() Snapshot 54 Hash(ignores map[IgnoreKey]struct{}) (uint32, error) 55 // Size returns the current size of the backend. 56 Size() int64 57 // SizeInUse returns the current size of the backend logically in use. 58 // Since the backend can manage free space in a non-byte unit such as 59 // number of pages, the returned value can be not exactly accurate in bytes. 60 SizeInUse() int64 61 Defrag() error 62 ForceCommit() 63 Close() error 64 } 65 66 type Snapshot interface { 67 // Size gets the size of the snapshot. 68 Size() int64 69 // WriteTo writes the snapshot into the given writer. 70 WriteTo(w io.Writer) (n int64, err error) 71 // Close closes the snapshot. 72 Close() error 73 } 74 75 type backend struct { 76 // size and commits are used with atomic operations so they must be 77 // 64-bit aligned, otherwise 32-bit tests will crash 78 79 // size is the number of bytes in the backend 80 size int64 81 82 // sizeInUse is the number of bytes actually used in the backend 83 sizeInUse int64 84 85 // commits counts number of commits since start 86 commits int64 87 88 mu sync.RWMutex 89 db *bolt.DB 90 91 batchInterval time.Duration 92 batchLimit int 93 batchTx *batchTxBuffered 94 95 readTx *readTx 96 97 stopc chan struct{} 98 donec chan struct{} 99 } 100 101 type BackendConfig struct { 102 // Path is the file path to the backend file. 103 Path string 104 // BatchInterval is the maximum time before flushing the BatchTx. 105 BatchInterval time.Duration 106 // BatchLimit is the maximum puts before flushing the BatchTx. 107 BatchLimit int 108 // MmapSize is the number of bytes to mmap for the backend. 109 MmapSize uint64 110 } 111 112 func DefaultBackendConfig() BackendConfig { 113 return BackendConfig{ 114 BatchInterval: defaultBatchInterval, 115 BatchLimit: defaultBatchLimit, 116 MmapSize: initialMmapSize, 117 } 118 } 119 120 func New(bcfg BackendConfig) Backend { 121 return newBackend(bcfg) 122 } 123 124 func NewDefaultBackend(path string) Backend { 125 bcfg := DefaultBackendConfig() 126 bcfg.Path = path 127 return newBackend(bcfg) 128 } 129 130 func newBackend(bcfg BackendConfig) *backend { 131 bopts := &bolt.Options{} 132 if boltOpenOptions != nil { 133 *bopts = *boltOpenOptions 134 } 135 bopts.InitialMmapSize = bcfg.mmapSize() 136 137 db, err := bolt.Open(bcfg.Path, 0600, bopts) 138 if err != nil { 139 plog.Panicf("cannot open database at %s (%v)", bcfg.Path, err) 140 } 141 142 // In future, may want to make buffering optional for low-concurrency systems 143 // or dynamically swap between buffered/non-buffered depending on workload. 144 b := &backend{ 145 db: db, 146 147 batchInterval: bcfg.BatchInterval, 148 batchLimit: bcfg.BatchLimit, 149 150 readTx: &readTx{ 151 buf: txReadBuffer{ 152 txBuffer: txBuffer{make(map[string]*bucketBuffer)}, 153 }, 154 buckets: make(map[string]*bolt.Bucket), 155 }, 156 157 stopc: make(chan struct{}), 158 donec: make(chan struct{}), 159 } 160 b.batchTx = newBatchTxBuffered(b) 161 go b.run() 162 return b 163 } 164 165 // BatchTx returns the current batch tx in coalescer. The tx can be used for read and 166 // write operations. The write result can be retrieved within the same tx immediately. 167 // The write result is isolated with other txs until the current one get committed. 168 func (b *backend) BatchTx() BatchTx { 169 return b.batchTx 170 } 171 172 func (b *backend) ReadTx() ReadTx { return b.readTx } 173 174 // ForceCommit forces the current batching tx to commit. 175 func (b *backend) ForceCommit() { 176 b.batchTx.Commit() 177 } 178 179 func (b *backend) Snapshot() Snapshot { 180 b.batchTx.Commit() 181 182 b.mu.RLock() 183 defer b.mu.RUnlock() 184 tx, err := b.db.Begin(false) 185 if err != nil { 186 plog.Fatalf("cannot begin tx (%s)", err) 187 } 188 189 stopc, donec := make(chan struct{}), make(chan struct{}) 190 dbBytes := tx.Size() 191 go func() { 192 defer close(donec) 193 // sendRateBytes is based on transferring snapshot data over a 1 gigabit/s connection 194 // assuming a min tcp throughput of 100MB/s. 195 var sendRateBytes int64 = 100 * 1024 * 1014 196 warningTimeout := time.Duration(int64((float64(dbBytes) / float64(sendRateBytes)) * float64(time.Second))) 197 if warningTimeout < minSnapshotWarningTimeout { 198 warningTimeout = minSnapshotWarningTimeout 199 } 200 start := time.Now() 201 ticker := time.NewTicker(warningTimeout) 202 defer ticker.Stop() 203 for { 204 select { 205 case <-ticker.C: 206 plog.Warningf("snapshotting is taking more than %v seconds to finish transferring %v MB [started at %v]", time.Since(start).Seconds(), float64(dbBytes)/float64(1024*1014), start) 207 case <-stopc: 208 snapshotDurations.Observe(time.Since(start).Seconds()) 209 return 210 } 211 } 212 }() 213 214 return &snapshot{tx, stopc, donec} 215 } 216 217 type IgnoreKey struct { 218 Bucket string 219 Key string 220 } 221 222 func (b *backend) Hash(ignores map[IgnoreKey]struct{}) (uint32, error) { 223 h := crc32.New(crc32.MakeTable(crc32.Castagnoli)) 224 225 b.mu.RLock() 226 defer b.mu.RUnlock() 227 err := b.db.View(func(tx *bolt.Tx) error { 228 c := tx.Cursor() 229 for next, _ := c.First(); next != nil; next, _ = c.Next() { 230 b := tx.Bucket(next) 231 if b == nil { 232 return fmt.Errorf("cannot get hash of bucket %s", string(next)) 233 } 234 h.Write(next) 235 b.ForEach(func(k, v []byte) error { 236 bk := IgnoreKey{Bucket: string(next), Key: string(k)} 237 if _, ok := ignores[bk]; !ok { 238 h.Write(k) 239 h.Write(v) 240 } 241 return nil 242 }) 243 } 244 return nil 245 }) 246 247 if err != nil { 248 return 0, err 249 } 250 251 return h.Sum32(), nil 252 } 253 254 func (b *backend) Size() int64 { 255 return atomic.LoadInt64(&b.size) 256 } 257 258 func (b *backend) SizeInUse() int64 { 259 return atomic.LoadInt64(&b.sizeInUse) 260 } 261 262 func (b *backend) run() { 263 defer close(b.donec) 264 t := time.NewTimer(b.batchInterval) 265 defer t.Stop() 266 for { 267 select { 268 case <-t.C: 269 case <-b.stopc: 270 b.batchTx.CommitAndStop() 271 return 272 } 273 b.batchTx.Commit() 274 t.Reset(b.batchInterval) 275 } 276 } 277 278 func (b *backend) Close() error { 279 close(b.stopc) 280 <-b.donec 281 return b.db.Close() 282 } 283 284 // Commits returns total number of commits since start 285 func (b *backend) Commits() int64 { 286 return atomic.LoadInt64(&b.commits) 287 } 288 289 func (b *backend) Defrag() error { 290 return b.defrag() 291 } 292 293 func (b *backend) defrag() error { 294 now := time.Now() 295 296 // TODO: make this non-blocking? 297 // lock batchTx to ensure nobody is using previous tx, and then 298 // close previous ongoing tx. 299 b.batchTx.Lock() 300 defer b.batchTx.Unlock() 301 302 // lock database after lock tx to avoid deadlock. 303 b.mu.Lock() 304 defer b.mu.Unlock() 305 306 // block concurrent read requests while resetting tx 307 b.readTx.mu.Lock() 308 defer b.readTx.mu.Unlock() 309 310 b.batchTx.unsafeCommit(true) 311 b.batchTx.tx = nil 312 313 // Create a temporary file to ensure we start with a clean slate. 314 // Snapshotter.cleanupSnapdir cleans up any of these that are found during startup. 315 dir := filepath.Dir(b.db.Path()) 316 temp, err := ioutil.TempFile(dir, "db.tmp.*") 317 if err != nil { 318 return err 319 } 320 options := bolt.Options{} 321 if boltOpenOptions != nil { 322 options = *boltOpenOptions 323 } 324 options.OpenFile = func(path string, i int, mode os.FileMode) (file *os.File, err error) { 325 return temp, nil 326 } 327 tdbp := temp.Name() 328 tmpdb, err := bolt.Open(tdbp, 0600, &options) 329 if err != nil { 330 return err 331 } 332 333 // gofail: var defragBeforeCopy struct{} 334 err = defragdb(b.db, tmpdb, defragLimit) 335 336 if err != nil { 337 tmpdb.Close() 338 if rmErr := os.RemoveAll(tmpdb.Path()); rmErr != nil { 339 plog.Fatalf("failed to remove db.tmp after defragmentation completed: %v", rmErr) 340 } 341 return err 342 } 343 344 dbp := b.db.Path() 345 346 err = b.db.Close() 347 if err != nil { 348 plog.Fatalf("cannot close database (%s)", err) 349 } 350 err = tmpdb.Close() 351 if err != nil { 352 plog.Fatalf("cannot close database (%s)", err) 353 } 354 // gofail: var defragBeforeRename struct{} 355 err = os.Rename(tdbp, dbp) 356 if err != nil { 357 plog.Fatalf("cannot rename database (%s)", err) 358 } 359 360 b.db, err = bolt.Open(dbp, 0600, boltOpenOptions) 361 if err != nil { 362 plog.Panicf("cannot open database at %s (%v)", dbp, err) 363 } 364 b.batchTx.tx, err = b.db.Begin(true) 365 if err != nil { 366 plog.Fatalf("cannot begin tx (%s)", err) 367 } 368 369 b.readTx.reset() 370 b.readTx.tx = b.unsafeBegin(false) 371 372 size := b.readTx.tx.Size() 373 db := b.db 374 atomic.StoreInt64(&b.size, size) 375 atomic.StoreInt64(&b.sizeInUse, size-(int64(db.Stats().FreePageN)*int64(db.Info().PageSize))) 376 377 took := time.Since(now) 378 defragDurations.Observe(took.Seconds()) 379 380 return nil 381 } 382 383 func defragdb(odb, tmpdb *bolt.DB, limit int) error { 384 // open a tx on tmpdb for writes 385 tmptx, err := tmpdb.Begin(true) 386 if err != nil { 387 return err 388 } 389 390 // open a tx on old db for read 391 tx, err := odb.Begin(false) 392 if err != nil { 393 return err 394 } 395 defer tx.Rollback() 396 397 c := tx.Cursor() 398 399 count := 0 400 for next, _ := c.First(); next != nil; next, _ = c.Next() { 401 b := tx.Bucket(next) 402 if b == nil { 403 return fmt.Errorf("backend: cannot defrag bucket %s", string(next)) 404 } 405 406 tmpb, berr := tmptx.CreateBucketIfNotExists(next) 407 if berr != nil { 408 return berr 409 } 410 tmpb.FillPercent = 0.9 // for seq write in for each 411 412 b.ForEach(func(k, v []byte) error { 413 count++ 414 if count > limit { 415 err = tmptx.Commit() 416 if err != nil { 417 return err 418 } 419 tmptx, err = tmpdb.Begin(true) 420 if err != nil { 421 return err 422 } 423 tmpb = tmptx.Bucket(next) 424 tmpb.FillPercent = 0.9 // for seq write in for each 425 426 count = 0 427 } 428 return tmpb.Put(k, v) 429 }) 430 } 431 432 return tmptx.Commit() 433 } 434 435 func (b *backend) begin(write bool) *bolt.Tx { 436 b.mu.RLock() 437 tx := b.unsafeBegin(write) 438 b.mu.RUnlock() 439 440 size := tx.Size() 441 db := tx.DB() 442 atomic.StoreInt64(&b.size, size) 443 atomic.StoreInt64(&b.sizeInUse, size-(int64(db.Stats().FreePageN)*int64(db.Info().PageSize))) 444 445 return tx 446 } 447 448 func (b *backend) unsafeBegin(write bool) *bolt.Tx { 449 tx, err := b.db.Begin(write) 450 if err != nil { 451 plog.Fatalf("cannot begin tx (%s)", err) 452 } 453 return tx 454 } 455 456 // NewTmpBackend creates a backend implementation for testing. 457 func NewTmpBackend(batchInterval time.Duration, batchLimit int) (*backend, string) { 458 dir, err := ioutil.TempDir(os.TempDir(), "etcd_backend_test") 459 if err != nil { 460 plog.Fatal(err) 461 } 462 tmpPath := filepath.Join(dir, "database") 463 bcfg := DefaultBackendConfig() 464 bcfg.Path, bcfg.BatchInterval, bcfg.BatchLimit = tmpPath, batchInterval, batchLimit 465 return newBackend(bcfg), tmpPath 466 } 467 468 func NewDefaultTmpBackend() (*backend, string) { 469 return NewTmpBackend(defaultBatchInterval, defaultBatchLimit) 470 } 471 472 type snapshot struct { 473 *bolt.Tx 474 stopc chan struct{} 475 donec chan struct{} 476 } 477 478 func (s *snapshot) Close() error { 479 close(s.stopc) 480 <-s.donec 481 return s.Tx.Rollback() 482 }