go.etcd.io/etcd@v3.3.27+incompatible/clientv3/snapshot/v3_snapshot.go

// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package snapshot

import (
	"context"
	"crypto/sha256"
	"encoding/json"
	"fmt"
	"hash/crc32"
	"io"
	"math"
	"os"
	"path/filepath"
	"reflect"
	"strings"
	"time"

	bolt "github.com/coreos/bbolt"
	"github.com/coreos/etcd/clientv3"
	"github.com/coreos/etcd/etcdserver"
	"github.com/coreos/etcd/etcdserver/etcdserverpb"
	"github.com/coreos/etcd/etcdserver/membership"
	"github.com/coreos/etcd/lease"
	"github.com/coreos/etcd/mvcc"
	"github.com/coreos/etcd/mvcc/backend"
	"github.com/coreos/etcd/pkg/fileutil"
	"github.com/coreos/etcd/pkg/types"
	"github.com/coreos/etcd/raft"
	"github.com/coreos/etcd/raft/raftpb"
	"github.com/coreos/etcd/snap"
	"github.com/coreos/etcd/store"
	"github.com/coreos/etcd/wal"
	"github.com/coreos/etcd/wal/walpb"
	"github.com/dustin/go-humanize"
	"go.uber.org/zap"
)

// Manager defines snapshot methods.
type Manager interface {
	// Save fetches a snapshot from the remote etcd server and saves the
	// data to the target path. If the context "ctx" is canceled or times
	// out, the snapshot save stream errors out (e.g. context.Canceled,
	// context.DeadlineExceeded). Make sure to specify only one endpoint
	// in the client configuration: the snapshot API must be requested
	// from a single selected node, and the saved snapshot is the
	// point-in-time state of that node.
	Save(ctx context.Context, cfg clientv3.Config, dbPath string) error

	// Status returns the snapshot file information.
	Status(dbPath string) (Status, error)

	// Restore restores a new etcd data directory from a given snapshot
	// file. It returns an error if the specified data directory already
	// exists, to prevent unintended data directory overwrites.
	Restore(cfg RestoreConfig) error
}

// NewV3 returns a new snapshot Manager for v3.x snapshots.
func NewV3(lg *zap.Logger) Manager {
	if lg == nil {
		lg = zap.NewExample()
	}
	return &v3Manager{lg: lg}
}
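
// A minimal usage sketch (added for illustration, not part of the original
// source); the endpoint, timeout, and file path below are hypothetical:
//
//	sp := snapshot.NewV3(zap.NewExample())
//	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
//	defer cancel()
//	cfg := clientv3.Config{Endpoints: []string{"127.0.0.1:2379"}}
//	if err := sp.Save(ctx, cfg, "backup.db"); err != nil {
//		// handle error
//	}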

type v3Manager struct {
	lg *zap.Logger

	name    string
	dbPath  string
	walDir  string
	snapDir string
	cl      *membership.RaftCluster

	skipHashCheck bool
}

// hasChecksum returns "true" if the file size "n" indicates an appended
// sha256 hash digest.
func hasChecksum(n int64) bool {
	// 512 is chosen because it is a minimum disk sector size that is
	// smaller than (and evenly divides) the OS page size on most systems.
	return (n % 512) == sha256.Size
}
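
// Worked example (added for clarity, not in the original source): a bolt
// database file is always a whole number of pages (typically 4096 bytes,
// itself a multiple of 512), so size%512 == 0 for a bare database. When the
// server appends a 32-byte sha256 digest to the snapshot stream, the total
// becomes 512*k + 32, and size%512 == sha256.Size detects exactly that case.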

// Save fetches a snapshot from the remote etcd server and saves the data to
// the target path.
func (s *v3Manager) Save(ctx context.Context, cfg clientv3.Config, dbPath string) error {
	if len(cfg.Endpoints) != 1 {
		return fmt.Errorf("snapshot must be requested to one selected node, not multiple %v", cfg.Endpoints)
	}
	cli, err := clientv3.New(cfg)
	if err != nil {
		return err
	}
	defer cli.Close()

	partpath := dbPath + ".part"
	defer os.RemoveAll(partpath)

	var f *os.File
	f, err = os.OpenFile(partpath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, fileutil.PrivateFileMode)
	if err != nil {
		return fmt.Errorf("could not open %s (%v)", partpath, err)
	}
	s.lg.Info("created temporary db file", zap.String("path", partpath))

	now := time.Now()
	var rd io.ReadCloser
	rd, err = cli.Snapshot(ctx)
	if err != nil {
		return err
	}
	s.lg.Info("fetching snapshot", zap.String("endpoint", cfg.Endpoints[0]))
	var size int64
	size, err = io.Copy(f, rd)
	if err != nil {
		return err
	}
	if !hasChecksum(size) {
		return fmt.Errorf("sha256 checksum not found [bytes: %d]", size)
	}
	if err = fileutil.Fsync(f); err != nil {
		return err
	}
	if err = f.Close(); err != nil {
		return err
	}
	s.lg.Info(
		"fetched snapshot",
		zap.String("endpoint", cfg.Endpoints[0]),
		zap.String("size", humanize.Bytes(uint64(size))),
		zap.Duration("took", time.Since(now)),
	)

	if err = os.Rename(partpath, dbPath); err != nil {
		return fmt.Errorf("could not rename %s to %s (%v)", partpath, dbPath, err)
	}
	s.lg.Info("saved", zap.String("path", dbPath))
	return nil
}

// Status is the snapshot file status.
type Status struct {
	Hash      uint32 `json:"hash"`
	Revision  int64  `json:"revision"`
	TotalKey  int    `json:"totalKey"`
	TotalSize int64  `json:"totalSize"`
}
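
// For illustration (added, not in the original source): a Status marshaled
// with encoding/json follows the field tags above; the values here are
// hypothetical:
//
//	{"hash":3700121941,"revision":12345,"totalKey":6,"totalSize":20480}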

// Status returns the snapshot file information.
func (s *v3Manager) Status(dbPath string) (ds Status, err error) {
	if _, err = os.Stat(dbPath); err != nil {
		return ds, err
	}

	db, err := bolt.Open(dbPath, 0400, &bolt.Options{ReadOnly: true})
	if err != nil {
		return ds, err
	}
	defer db.Close()

	h := crc32.New(crc32.MakeTable(crc32.Castagnoli))

	if err = db.View(func(tx *bolt.Tx) error {
		// check snapshot file integrity first
		var dbErrStrings []string
		for dbErr := range tx.Check() {
			dbErrStrings = append(dbErrStrings, dbErr.Error())
		}
		if len(dbErrStrings) > 0 {
			return fmt.Errorf("snapshot file integrity check failed. %d errors found.\n"+strings.Join(dbErrStrings, "\n"), len(dbErrStrings))
		}
		ds.TotalSize = tx.Size()
		c := tx.Cursor()
		for next, _ := c.First(); next != nil; next, _ = c.Next() {
			b := tx.Bucket(next)
			if b == nil {
				return fmt.Errorf("cannot get hash of bucket %s", string(next))
			}
			h.Write(next)
			iskeyb := (string(next) == "key")
			b.ForEach(func(k, v []byte) error {
				h.Write(k)
				h.Write(v)
				if iskeyb {
					rev := bytesToRev(k)
					ds.Revision = rev.main
				}
				ds.TotalKey++
				return nil
			})
		}
		return nil
	}); err != nil {
		return ds, err
	}

	ds.Hash = h.Sum32()
	return ds, nil
}
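
// Note (added for clarity, not in the original source): the hash computed
// above is a CRC32 (Castagnoli) over every bucket name, key, and value in
// the bolt database, so it covers the whole keyspace rather than only the
// "key" bucket; Revision is taken from the last key seen in the "key" bucket.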

// RestoreConfig configures the snapshot restore operation.
type RestoreConfig struct {
	// SnapshotPath is the path of the snapshot file to restore from.
	SnapshotPath string

	// Name is the human-readable name of this member.
	Name string

	// OutputDataDir is the target data directory to save restored data.
	// OutputDataDir should not conflict with an existing etcd data directory.
	// If OutputDataDir already exists, restore returns an error to prevent
	// unintended data directory overwrites.
	// If empty, it defaults to "[Name].etcd".
	OutputDataDir string
	// OutputWALDir is the target WAL data directory.
	// If empty, it defaults to "[OutputDataDir]/member/wal".
	OutputWALDir string

	// PeerURLs is a list of the member's peer URLs to advertise to the rest of the cluster.
	PeerURLs []string

	// InitialCluster is the initial cluster configuration for restore bootstrap.
	InitialCluster string
	// InitialClusterToken is the initial cluster token for the etcd cluster during restore bootstrap.
	InitialClusterToken string

	// SkipHashCheck is "true" to ignore the snapshot integrity hash value
	// (required if the snapshot was copied from a data directory).
	SkipHashCheck bool
}
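
// A minimal restore sketch (added for illustration, not part of the original
// source); every name, URL, token, and path below is hypothetical:
//
//	err := snapshot.NewV3(zap.NewExample()).Restore(snapshot.RestoreConfig{
//		SnapshotPath:        "backup.db",
//		Name:                "m1",
//		OutputDataDir:       "m1.etcd",
//		PeerURLs:            []string{"http://127.0.0.1:2380"},
//		InitialCluster:      "m1=http://127.0.0.1:2380",
//		InitialClusterToken: "etcd-cluster-1",
//	})
//
// After a restore, the member is expected to be started with flags that
// match this configuration (name, initial cluster, token, and peer URLs).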

// Restore restores a new etcd data directory from a given snapshot file.
func (s *v3Manager) Restore(cfg RestoreConfig) error {
	pURLs, err := types.NewURLs(cfg.PeerURLs)
	if err != nil {
		return err
	}
	var ics types.URLsMap
	ics, err = types.NewURLsMap(cfg.InitialCluster)
	if err != nil {
		return err
	}

	srv := etcdserver.ServerConfig{
		Name:                cfg.Name,
		PeerURLs:            pURLs,
		InitialPeerURLsMap:  ics,
		InitialClusterToken: cfg.InitialClusterToken,
	}
	if err = srv.VerifyBootstrap(); err != nil {
		return err
	}

	s.cl, err = membership.NewClusterFromURLsMap(cfg.InitialClusterToken, ics)
	if err != nil {
		return err
	}

	dataDir := cfg.OutputDataDir
	if dataDir == "" {
		dataDir = cfg.Name + ".etcd"
	}
	if fileutil.Exist(dataDir) {
		return fmt.Errorf("data-dir %q exists", dataDir)
	}

	walDir := cfg.OutputWALDir
	if walDir == "" {
		walDir = filepath.Join(dataDir, "member", "wal")
	} else if fileutil.Exist(walDir) {
		return fmt.Errorf("wal-dir %q exists", walDir)
	}

	s.name = cfg.Name
	s.dbPath = cfg.SnapshotPath
	s.walDir = walDir
	s.snapDir = filepath.Join(dataDir, "member", "snap")
	s.skipHashCheck = cfg.SkipHashCheck

	s.lg.Info(
		"restoring snapshot",
		zap.String("path", s.dbPath),
		zap.String("wal-dir", s.walDir),
		zap.String("data-dir", dataDir),
		zap.String("snap-dir", s.snapDir),
	)
	if err = s.saveDB(); err != nil {
		return err
	}
	if err = s.saveWALAndSnap(); err != nil {
		return err
	}
	s.lg.Info(
		"restored snapshot",
		zap.String("path", s.dbPath),
		zap.String("wal-dir", s.walDir),
		zap.String("data-dir", dataDir),
		zap.String("snap-dir", s.snapDir),
	)

	return nil
}

// saveDB copies the database snapshot to the snapshot directory.
func (s *v3Manager) saveDB() error {
	f, ferr := os.OpenFile(s.dbPath, os.O_RDONLY, 0600)
	if ferr != nil {
		return ferr
	}
	defer f.Close()

	// get snapshot integrity hash
	if _, err := f.Seek(-sha256.Size, io.SeekEnd); err != nil {
		return err
	}
	sha := make([]byte, sha256.Size)
	if _, err := f.Read(sha); err != nil {
		return err
	}
	if _, err := f.Seek(0, io.SeekStart); err != nil {
		return err
	}

	if err := fileutil.CreateDirAll(s.snapDir); err != nil {
		return err
	}

	dbpath := filepath.Join(s.snapDir, "db")
	db, dberr := os.OpenFile(dbpath, os.O_RDWR|os.O_CREATE, 0600)
	if dberr != nil {
		return dberr
	}
	if _, err := io.Copy(db, f); err != nil {
		return err
	}

	// truncate away integrity hash, if any.
	off, serr := db.Seek(0, io.SeekEnd)
	if serr != nil {
		return serr
	}
	hasHash := hasChecksum(off)
	if hasHash {
		if err := db.Truncate(off - sha256.Size); err != nil {
			return err
		}
	}

	if !hasHash && !s.skipHashCheck {
		return fmt.Errorf("snapshot missing hash but --skip-hash-check=false")
	}

	if hasHash && !s.skipHashCheck {
		// check for match
		if _, err := db.Seek(0, io.SeekStart); err != nil {
			return err
		}
		h := sha256.New()
		if _, err := io.Copy(h, db); err != nil {
			return err
		}
		dbsha := h.Sum(nil)
		if !reflect.DeepEqual(sha, dbsha) {
			return fmt.Errorf("expected sha256 %v, got %v", sha, dbsha)
		}
	}

	// db hash is OK, can now modify DB so it can be part of a new cluster
	db.Close()

	commit := len(s.cl.Members())
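
	// Note (added for clarity, not in the original source): "commit" is the
	// number of cluster members, which equals the number of ConfChange
	// entries written to the WAL in saveWALAndSnap below (indices 1..N).
	// Using it as the consistent index makes the restored server treat
	// those bootstrap entries as already applied to the key-value store.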

	// update consistentIndex so applies go through on etcdserver despite
	// having a new raft instance
	be := backend.NewDefaultBackend(dbpath)

	// a lessor that never times out leases
	lessor := lease.NewLessor(be, math.MaxInt64)

	mvs := mvcc.NewStore(be, lessor, (*initIndex)(&commit))
	txn := mvs.Write()
	btx := be.BatchTx()
	del := func(k, v []byte) error {
		txn.DeleteRange(k, nil)
		return nil
	}

	// delete stored members from old cluster since using new members
	btx.UnsafeForEach([]byte("members"), del)

	// todo: add back new members when we start to deprecate old snap file.
	btx.UnsafeForEach([]byte("members_removed"), del)

	// trigger write-out of new consistent index
	txn.End()

	mvs.Commit()
	mvs.Close()
	be.Close()

	return nil
}

// saveWALAndSnap creates a WAL for the initial cluster.
func (s *v3Manager) saveWALAndSnap() error {
	if err := fileutil.CreateDirAll(s.walDir); err != nil {
		return err
	}

	// add members again to persist them to the store we create.
	st := store.New(etcdserver.StoreClusterPrefix, etcdserver.StoreKeysPrefix)
	s.cl.SetStore(st)
	for _, m := range s.cl.Members() {
		s.cl.AddMember(m)
	}

	m := s.cl.MemberByName(s.name)
	md := &etcdserverpb.Metadata{NodeID: uint64(m.ID), ClusterID: uint64(s.cl.ID())}
	metadata, merr := md.Marshal()
	if merr != nil {
		return merr
	}
	w, walerr := wal.Create(s.walDir, metadata)
	if walerr != nil {
		return walerr
	}
	defer w.Close()

	peers := make([]raft.Peer, len(s.cl.MemberIDs()))
	for i, id := range s.cl.MemberIDs() {
		ctx, err := json.Marshal((*s.cl).Member(id))
		if err != nil {
			return err
		}
		peers[i] = raft.Peer{ID: uint64(id), Context: ctx}
	}

	ents := make([]raftpb.Entry, len(peers))
	nodeIDs := make([]uint64, len(peers))
	for i, p := range peers {
		nodeIDs[i] = p.ID
		cc := raftpb.ConfChange{
			Type:    raftpb.ConfChangeAddNode,
			NodeID:  p.ID,
			Context: p.Context,
		}
		d, err := cc.Marshal()
		if err != nil {
			return err
		}
		ents[i] = raftpb.Entry{
			Type:  raftpb.EntryConfChange,
			Term:  1,
			Index: uint64(i + 1),
			Data:  d,
		}
	}
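
	// Note (added for clarity, not in the original source): each member gets
	// one ConfChangeAddNode entry at term 1 with indices 1..N. The HardState
	// saved below marks all of these entries as committed, so a restored
	// member rebuilds the full cluster membership when it replays the WAL.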

	commit, term := uint64(len(ents)), uint64(1)
	if err := w.Save(raftpb.HardState{
		Term:   term,
		Vote:   peers[0].ID,
		Commit: commit,
	}, ents); err != nil {
		return err
	}

	b, berr := st.Save()
	if berr != nil {
		return berr
	}
	raftSnap := raftpb.Snapshot{
		Data: b,
		Metadata: raftpb.SnapshotMetadata{
			Index: commit,
			Term:  term,
			ConfState: raftpb.ConfState{
				Nodes: nodeIDs,
			},
		},
	}
	sn := snap.New(s.snapDir)
	if err := sn.SaveSnap(raftSnap); err != nil {
		return err
	}
	return w.SaveSnapshot(walpb.Snapshot{Index: commit, Term: term})
}