go.etcd.io/etcd@v3.3.27+incompatible/clientv3/snapshot/v3_snapshot.go

// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package snapshot

import (
	"context"
	"crypto/sha256"
	"encoding/json"
	"fmt"
	"hash/crc32"
	"io"
	"math"
	"os"
	"path/filepath"
	"reflect"
	"strings"
	"time"

	bolt "github.com/coreos/bbolt"
	"github.com/coreos/etcd/clientv3"
	"github.com/coreos/etcd/etcdserver"
	"github.com/coreos/etcd/etcdserver/etcdserverpb"
	"github.com/coreos/etcd/etcdserver/membership"
	"github.com/coreos/etcd/lease"
	"github.com/coreos/etcd/mvcc"
	"github.com/coreos/etcd/mvcc/backend"
	"github.com/coreos/etcd/pkg/fileutil"
	"github.com/coreos/etcd/pkg/types"
	"github.com/coreos/etcd/raft"
	"github.com/coreos/etcd/raft/raftpb"
	"github.com/coreos/etcd/snap"
	"github.com/coreos/etcd/store"
	"github.com/coreos/etcd/wal"
	"github.com/coreos/etcd/wal/walpb"
	"github.com/dustin/go-humanize"
	"go.uber.org/zap"
)

// Manager defines snapshot methods.
type Manager interface {
	// Save fetches a snapshot from a remote etcd server and saves the data
	// to the target path. If the context "ctx" is canceled or times out,
	// the snapshot save stream errors out (e.g. context.Canceled,
	// context.DeadlineExceeded). Make sure to specify only one endpoint
	// in the client configuration. The snapshot API must be requested to a
	// selected node, and the saved snapshot is the point-in-time state of
	// that node.
	Save(ctx context.Context, cfg clientv3.Config, dbPath string) error

	// Status returns the snapshot file information.
	Status(dbPath string) (Status, error)

	// Restore restores a new etcd data directory from a given snapshot
	// file. It returns an error if the specified data directory already
	// exists, to prevent unintended data directory overwrites.
	Restore(cfg RestoreConfig) error
}

// NewV3 returns a new snapshot Manager for v3.x snapshots.
func NewV3(lg *zap.Logger) Manager {
	if lg == nil {
		lg = zap.NewExample()
	}
	return &v3Manager{lg: lg}
}

type v3Manager struct {
	lg *zap.Logger

	name    string
	dbPath  string
	walDir  string
	snapDir string
	cl      *membership.RaftCluster

	skipHashCheck bool
}

// hasChecksum returns "true" if the file size "n"
// has an appended sha256 hash digest.
func hasChecksum(n int64) bool {
	// 512 is chosen because it is the minimum disk sector size,
	// which is smaller than (and evenly divides) the OS page size on most systems.
	return (n % 512) == sha256.Size
}
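
// exampleSaveUsage is a minimal usage sketch for Save: it assumes a local
// etcd endpoint at 127.0.0.1:2379 and writes to "snapshot.db"; the endpoint,
// timeouts, target path, and this helper itself are illustrative, not part
// of the package API.
func exampleSaveUsage() error {
	sp := NewV3(zap.NewExample())

	// The snapshot API must be requested against exactly one endpoint.
	cfg := clientv3.Config{
		Endpoints:   []string{"127.0.0.1:2379"},
		DialTimeout: 5 * time.Second,
	}

	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	// Save streams the backend database into "snapshot.db" via a ".part" temp file.
	return sp.Save(ctx, cfg, "snapshot.db")
}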

// Save fetches a snapshot from a remote etcd server and saves the data to the target path.
func (s *v3Manager) Save(ctx context.Context, cfg clientv3.Config, dbPath string) error {
	if len(cfg.Endpoints) != 1 {
		return fmt.Errorf("snapshot must be requested to one selected node, not multiple %v", cfg.Endpoints)
	}
	cli, err := clientv3.New(cfg)
	if err != nil {
		return err
	}
	defer cli.Close()

	partpath := dbPath + ".part"
	defer os.RemoveAll(partpath)

	var f *os.File
	f, err = os.OpenFile(partpath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, fileutil.PrivateFileMode)
	if err != nil {
		return fmt.Errorf("could not open %s (%v)", partpath, err)
	}
	s.lg.Info("created temporary db file", zap.String("path", partpath))

	now := time.Now()
	var rd io.ReadCloser
	rd, err = cli.Snapshot(ctx)
	if err != nil {
		return err
	}
	s.lg.Info("fetching snapshot", zap.String("endpoint", cfg.Endpoints[0]))
	var size int64
	size, err = io.Copy(f, rd)
	if err != nil {
		return err
	}
	if !hasChecksum(size) {
		return fmt.Errorf("sha256 checksum not found [bytes: %d]", size)
	}
	if err = fileutil.Fsync(f); err != nil {
		return err
	}
	if err = f.Close(); err != nil {
		return err
	}
	s.lg.Info(
		"fetched snapshot",
		zap.String("endpoint", cfg.Endpoints[0]),
		zap.String("size", humanize.Bytes(uint64(size))),
		zap.Duration("took", time.Since(now)),
	)

	if err = os.Rename(partpath, dbPath); err != nil {
		return fmt.Errorf("could not rename %s to %s (%v)", partpath, dbPath, err)
	}
	s.lg.Info("saved", zap.String("path", dbPath))
	return nil
}

// Status is the snapshot file status.
type Status struct {
	Hash      uint32 `json:"hash"`
	Revision  int64  `json:"revision"`
	TotalKey  int    `json:"totalKey"`
	TotalSize int64  `json:"totalSize"`
}

// Status returns the snapshot file information.
func (s *v3Manager) Status(dbPath string) (ds Status, err error) {
	if _, err = os.Stat(dbPath); err != nil {
		return ds, err
	}

	db, err := bolt.Open(dbPath, 0400, &bolt.Options{ReadOnly: true})
	if err != nil {
		return ds, err
	}
	defer db.Close()

	h := crc32.New(crc32.MakeTable(crc32.Castagnoli))

	if err = db.View(func(tx *bolt.Tx) error {
		// check snapshot file integrity first
		var dbErrStrings []string
		for dbErr := range tx.Check() {
			dbErrStrings = append(dbErrStrings, dbErr.Error())
		}
		if len(dbErrStrings) > 0 {
			return fmt.Errorf("snapshot file integrity check failed. %d errors found.\n"+strings.Join(dbErrStrings, "\n"), len(dbErrStrings))
		}
		ds.TotalSize = tx.Size()
		c := tx.Cursor()
		for next, _ := c.First(); next != nil; next, _ = c.Next() {
			b := tx.Bucket(next)
			if b == nil {
				return fmt.Errorf("cannot get hash of bucket %s", string(next))
			}
			h.Write(next)
			iskeyb := (string(next) == "key")
			b.ForEach(func(k, v []byte) error {
				h.Write(k)
				h.Write(v)
				if iskeyb {
					rev := bytesToRev(k)
					ds.Revision = rev.main
				}
				ds.TotalKey++
				return nil
			})
		}
		return nil
	}); err != nil {
		return ds, err
	}

	ds.Hash = h.Sum32()
	return ds, nil
}
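
// exampleStatusUsage is a minimal usage sketch for Status: it assumes a
// previously saved "snapshot.db" and prints the reported hash, revision,
// key count, and size as JSON; the path and this helper are illustrative,
// not part of the package API.
func exampleStatusUsage() error {
	sp := NewV3(zap.NewExample())

	ds, err := sp.Status("snapshot.db")
	if err != nil {
		return err
	}

	// Status walks every bucket in the bolt file, so the CRC-32 hash covers
	// all bucket names, keys, and values.
	out, err := json.Marshal(ds)
	if err != nil {
		return err
	}
	fmt.Println(string(out))
	return nil
}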

// RestoreConfig configures the snapshot restore operation.
type RestoreConfig struct {
	// SnapshotPath is the path of the snapshot file to restore from.
	SnapshotPath string

	// Name is the human-readable name of this member.
	Name string

	// OutputDataDir is the target data directory to save restored data.
	// OutputDataDir should not conflict with an existing etcd data directory.
	// If OutputDataDir already exists, Restore returns an error to prevent
	// unintended data directory overwrites.
	// If empty, defaults to "[Name].etcd".
	OutputDataDir string
	// OutputWALDir is the target WAL data directory.
	// If empty, defaults to "[OutputDataDir]/member/wal".
	OutputWALDir string

	// PeerURLs is a list of the member's peer URLs to advertise to the rest of the cluster.
	PeerURLs []string

	// InitialCluster is the initial cluster configuration for restore bootstrap.
	InitialCluster string
	// InitialClusterToken is the initial cluster token for the etcd cluster during restore bootstrap.
	InitialClusterToken string

	// SkipHashCheck is "true" to ignore the snapshot integrity hash value
	// (required if copied from a data directory).
	SkipHashCheck bool
}

// Restore restores a new etcd data directory from a given snapshot file.
func (s *v3Manager) Restore(cfg RestoreConfig) error {
	pURLs, err := types.NewURLs(cfg.PeerURLs)
	if err != nil {
		return err
	}
	var ics types.URLsMap
	ics, err = types.NewURLsMap(cfg.InitialCluster)
	if err != nil {
		return err
	}

	srv := etcdserver.ServerConfig{
		Name:                cfg.Name,
		PeerURLs:            pURLs,
		InitialPeerURLsMap:  ics,
		InitialClusterToken: cfg.InitialClusterToken,
	}
	if err = srv.VerifyBootstrap(); err != nil {
		return err
	}

	s.cl, err = membership.NewClusterFromURLsMap(cfg.InitialClusterToken, ics)
	if err != nil {
		return err
	}

	dataDir := cfg.OutputDataDir
	if dataDir == "" {
		dataDir = cfg.Name + ".etcd"
	}
	if fileutil.Exist(dataDir) {
		return fmt.Errorf("data-dir %q exists", dataDir)
	}

	walDir := cfg.OutputWALDir
	if walDir == "" {
		walDir = filepath.Join(dataDir, "member", "wal")
	} else if fileutil.Exist(walDir) {
		return fmt.Errorf("wal-dir %q exists", walDir)
	}

	s.name = cfg.Name
	s.dbPath = cfg.SnapshotPath
	s.walDir = walDir
	s.snapDir = filepath.Join(dataDir, "member", "snap")
	s.skipHashCheck = cfg.SkipHashCheck

	s.lg.Info(
		"restoring snapshot",
		zap.String("path", s.dbPath),
		zap.String("wal-dir", s.walDir),
		zap.String("data-dir", dataDir),
		zap.String("snap-dir", s.snapDir),
	)
	if err = s.saveDB(); err != nil {
		return err
	}
	if err = s.saveWALAndSnap(); err != nil {
		return err
	}
	s.lg.Info(
		"restored snapshot",
		zap.String("path", s.dbPath),
		zap.String("wal-dir", s.walDir),
		zap.String("data-dir", dataDir),
		zap.String("snap-dir", s.snapDir),
	)

	return nil
}
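
// exampleRestoreUsage is a minimal usage sketch for Restore with a
// single-member cluster; the member name, URLs, token, and paths are
// illustrative and this helper is not part of the package API. For a
// multi-member cluster, InitialCluster would list every member and Restore
// would typically be run once per member with that member's Name and PeerURLs.
func exampleRestoreUsage() error {
	sp := NewV3(zap.NewExample())
	return sp.Restore(RestoreConfig{
		SnapshotPath:        "snapshot.db",
		Name:                "m1",
		OutputDataDir:       "m1.etcd",
		PeerURLs:            []string{"http://127.0.0.1:2380"},
		InitialCluster:      "m1=http://127.0.0.1:2380",
		InitialClusterToken: "etcd-cluster-1",
		// SkipHashCheck is needed only when the db file was copied out of a
		// data directory instead of being produced by the snapshot API.
		SkipHashCheck: false,
	})
}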

// saveDB copies the database snapshot to the snapshot directory.
func (s *v3Manager) saveDB() error {
	f, ferr := os.OpenFile(s.dbPath, os.O_RDONLY, 0600)
	if ferr != nil {
		return ferr
	}
	defer f.Close()

	// get snapshot integrity hash
	if _, err := f.Seek(-sha256.Size, io.SeekEnd); err != nil {
		return err
	}
	sha := make([]byte, sha256.Size)
	if _, err := f.Read(sha); err != nil {
		return err
	}
	if _, err := f.Seek(0, io.SeekStart); err != nil {
		return err
	}

	if err := fileutil.CreateDirAll(s.snapDir); err != nil {
		return err
	}

	dbpath := filepath.Join(s.snapDir, "db")
	db, dberr := os.OpenFile(dbpath, os.O_RDWR|os.O_CREATE, 0600)
	if dberr != nil {
		return dberr
	}
	if _, err := io.Copy(db, f); err != nil {
		return err
	}

	// truncate away the integrity hash, if any.
	off, serr := db.Seek(0, io.SeekEnd)
	if serr != nil {
		return serr
	}
	hasHash := hasChecksum(off)
	if hasHash {
		if err := db.Truncate(off - sha256.Size); err != nil {
			return err
		}
	}

	if !hasHash && !s.skipHashCheck {
		return fmt.Errorf("snapshot missing hash but --skip-hash-check=false")
	}

	if hasHash && !s.skipHashCheck {
		// check for a match
		if _, err := db.Seek(0, io.SeekStart); err != nil {
			return err
		}
		h := sha256.New()
		if _, err := io.Copy(h, db); err != nil {
			return err
		}
		dbsha := h.Sum(nil)
		if !reflect.DeepEqual(sha, dbsha) {
			return fmt.Errorf("expected sha256 %v, got %v", sha, dbsha)
		}
	}

	// db hash is OK, can now modify DB so it can be part of a new cluster
	db.Close()

	commit := len(s.cl.Members())

	// update consistentIndex so applies go through on etcdserver despite
	// having a new raft instance
	be := backend.NewDefaultBackend(dbpath)

	// a lessor that never times out leases
	lessor := lease.NewLessor(be, math.MaxInt64)

	mvs := mvcc.NewStore(be, lessor, (*initIndex)(&commit))
	txn := mvs.Write()
	btx := be.BatchTx()
	del := func(k, v []byte) error {
		txn.DeleteRange(k, nil)
		return nil
	}

	// delete stored members from the old cluster since we are using new members
	btx.UnsafeForEach([]byte("members"), del)

	// TODO: add back new members when we start to deprecate the old snap file.
	btx.UnsafeForEach([]byte("members_removed"), del)

	// trigger write-out of the new consistent index
	txn.End()

	mvs.Commit()
	mvs.Close()
	be.Close()

	return nil
}
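
// exampleVerifyTrailingHash is an illustrative sketch of the integrity check
// that saveDB performs: a snapshot written by the v3 snapshot API is the bolt
// database contents followed by a 32-byte sha256 of those contents, which
// hasChecksum infers from the file size alone. This helper is not part of
// the package API.
func exampleVerifyTrailingHash(path string) error {
	f, err := os.Open(path)
	if err != nil {
		return err
	}
	defer f.Close()

	fi, err := f.Stat()
	if err != nil {
		return err
	}
	if !hasChecksum(fi.Size()) {
		return fmt.Errorf("no sha256 trailer found [bytes: %d]", fi.Size())
	}

	// Hash everything except the last sha256.Size bytes.
	h := sha256.New()
	if _, err := io.CopyN(h, f, fi.Size()-sha256.Size); err != nil {
		return err
	}

	// Read the stored digest and compare it to the computed one.
	want := make([]byte, sha256.Size)
	if _, err := io.ReadFull(f, want); err != nil {
		return err
	}
	if !reflect.DeepEqual(want, h.Sum(nil)) {
		return fmt.Errorf("expected sha256 %v, got %v", want, h.Sum(nil))
	}
	return nil
}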

// saveWALAndSnap creates a WAL for the initial cluster.
func (s *v3Manager) saveWALAndSnap() error {
	if err := fileutil.CreateDirAll(s.walDir); err != nil {
		return err
	}

	// add members again to persist them to the store we create.
	st := store.New(etcdserver.StoreClusterPrefix, etcdserver.StoreKeysPrefix)
	s.cl.SetStore(st)
	for _, m := range s.cl.Members() {
		s.cl.AddMember(m)
	}

	m := s.cl.MemberByName(s.name)
	md := &etcdserverpb.Metadata{NodeID: uint64(m.ID), ClusterID: uint64(s.cl.ID())}
	metadata, merr := md.Marshal()
	if merr != nil {
		return merr
	}
	w, walerr := wal.Create(s.walDir, metadata)
	if walerr != nil {
		return walerr
	}
	defer w.Close()

	peers := make([]raft.Peer, len(s.cl.MemberIDs()))
	for i, id := range s.cl.MemberIDs() {
		ctx, err := json.Marshal((*s.cl).Member(id))
		if err != nil {
			return err
		}
		peers[i] = raft.Peer{ID: uint64(id), Context: ctx}
	}

	ents := make([]raftpb.Entry, len(peers))
	nodeIDs := make([]uint64, len(peers))
	for i, p := range peers {
		nodeIDs[i] = p.ID
		cc := raftpb.ConfChange{
			Type:    raftpb.ConfChangeAddNode,
			NodeID:  p.ID,
			Context: p.Context,
		}
		d, err := cc.Marshal()
		if err != nil {
			return err
		}
		ents[i] = raftpb.Entry{
			Type:  raftpb.EntryConfChange,
			Term:  1,
			Index: uint64(i + 1),
			Data:  d,
		}
	}

	commit, term := uint64(len(ents)), uint64(1)
	if err := w.Save(raftpb.HardState{
		Term:   term,
		Vote:   peers[0].ID,
		Commit: commit,
	}, ents); err != nil {
		return err
	}

	b, berr := st.Save()
	if berr != nil {
		return berr
	}
	raftSnap := raftpb.Snapshot{
		Data: b,
		Metadata: raftpb.SnapshotMetadata{
			Index: commit,
			Term:  term,
			ConfState: raftpb.ConfState{
				Nodes: nodeIDs,
			},
		},
	}
	sn := snap.New(s.snapDir)
	if err := sn.SaveSnap(raftSnap); err != nil {
		return err
	}
	return w.SaveSnapshot(walpb.Snapshot{Index: commit, Term: term})
}
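
// exampleRestoredLayout is an illustrative sketch of the on-disk layout that
// Restore leaves behind: saveDB places the hash-stripped bolt database under
// "member/snap/db" and saveWALAndSnap writes the bootstrap WAL under
// "member/wal" (unless OutputWALDir overrides it). An etcd member pointed at
// the same data directory can then start from this state. The helper itself
// is not part of the package API.
func exampleRestoredLayout(dataDir string) error {
	// Written by saveDB.
	dbFile := filepath.Join(dataDir, "member", "snap", "db")
	// Written by saveWALAndSnap (default location).
	walDir := filepath.Join(dataDir, "member", "wal")

	for _, p := range []string{dbFile, walDir} {
		if !fileutil.Exist(p) {
			return fmt.Errorf("expected %q to exist after restore", p)
		}
	}
	return nil
}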