github.com/weaviate/weaviate@v1.24.6/usecases/backup/coordinator.go

//                           _       _
// __      _____  __ ___   ___  __ _| |_ ___
// \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
//  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
//   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
//
//  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
//
//  CONTACT: hello@weaviate.io
//

package backup

import (
	"context"
	"errors"
	"fmt"
	"slices"
	"sync"
	"sync/atomic"
	"time"

	enterrors "github.com/weaviate/weaviate/entities/errors"

	"github.com/sirupsen/logrus"
	"github.com/weaviate/weaviate/entities/backup"
	"github.com/weaviate/weaviate/usecases/config"
)

// Op is the kind of a backup operation
type Op string

const (
	OpCreate  Op = "create"
	OpRestore Op = "restore"
)

var (
	// errNoShardFound = errors.New("no shard found")
	errCannotCommit = errors.New("cannot commit")
	errMetaNotFound = errors.New("metadata not found")
	errUnknownOp    = errors.New("unknown backup operation")
)

const (
	_BookingPeriod      = time.Second * 20
	_TimeoutNodeDown    = 7 * time.Minute
	_TimeoutQueryStatus = 5 * time.Second
	_TimeoutCanCommit   = 8 * time.Second
	_NextRoundPeriod    = 10 * time.Second
	_MaxNumberConns     = 16
)
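
// Note: _BookingPeriod is the reservation window sent to participants in
// canCommit requests; the remaining timeouts bound each phase of the
// protocol (see canCommit, commit, and queryAll below).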

type nodeMap map[string]*backup.NodeDescriptor

// participantStatus tracks status of a participant in a DBRO
type participantStatus struct {
	Status   backup.Status
	LastTime time.Time
	Reason   string
}

// selector is used to select participant nodes
type selector interface {
	// Shards gets all nodes on which this class is sharded
	Shards(ctx context.Context, class string) ([]string, error)
	// ListClasses returns a list of all existing classes.
	// This is needed if the user doesn't include any classes.
	ListClasses(ctx context.Context) []string

	// Backupable returns whether all given classes can be backed up.
	Backupable(_ context.Context, classes []string) error
}
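
// A minimal sketch of a selector for a single-node cluster (hypothetical,
// for illustration only; the production implementation is backed by the
// schema):
//
//	type staticSelector struct{ classes []string }
//
//	func (s staticSelector) Shards(context.Context, string) ([]string, error) {
//		return []string{"node1"}, nil
//	}
//	func (s staticSelector) ListClasses(context.Context) []string { return s.classes }
//	func (s staticSelector) Backupable(context.Context, []string) error { return nil }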

// coordinator coordinates a distributed backup and restore operation (DBRO):
//
// - It determines what request to send to which shard.
//
// - It returns an error if any shard refuses to participate in the DBRO.
//
// - It keeps all metadata needed to resume a DBRO in external storage (e.g. S3).
//
// - When it starts, it checks for any broken DBROs using its metadata.
//
// - It can resume a broken DBRO.
//
// - It marks the whole DBRO as failed if any shard fails to do its BRO.
//
// - The coordinator will try to repair previous DBROs whenever possible.
type coordinator struct {
	// dependencies
	selector     selector
	client       client
	log          logrus.FieldLogger
	nodeResolver nodeResolver

	// state
	Participants map[string]participantStatus
	descriptor   *backup.DistributedBackupDescriptor
	shardSyncChan

	// timeouts
	timeoutNodeDown    time.Duration
	timeoutQueryStatus time.Duration
	timeoutCanCommit   time.Duration
	timeoutNextRound   time.Duration
}

// newCoordinator creates an instance which coordinates distributed BRO operations among many shards.
func newCoordinator(
	selector selector,
	client client,
	log logrus.FieldLogger,
	nodeResolver nodeResolver,
) *coordinator {
	return &coordinator{
		selector:           selector,
		client:             client,
		log:                log,
		nodeResolver:       nodeResolver,
		Participants:       make(map[string]participantStatus, 16),
		timeoutNodeDown:    _TimeoutNodeDown,
		timeoutQueryStatus: _TimeoutQueryStatus,
		timeoutCanCommit:   _TimeoutCanCommit,
		timeoutNextRound:   _NextRoundPeriod,
	}
}
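
// A hypothetical usage sketch (the wiring below is an assumption for
// illustration; in practice the backup scheduler constructs and drives
// the coordinator):
//
//	c := newCoordinator(selector, client, logger, resolver)
//	err := c.Backup(ctx, store, &Request{ID: "my-backup", Backend: "s3", Classes: classes})
//	if err != nil {
//		// another operation is in progress, or a participant refused to commit
//	}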

// Backup coordinates a distributed backup among participants
func (c *coordinator) Backup(ctx context.Context, store coordStore, req *Request) error {
	req.Method = OpCreate
	groups, err := c.groupByShard(ctx, req.Classes)
	if err != nil {
		return err
	}
	// make sure there is no active backup
	if prevID := c.lastOp.renew(req.ID, store.HomeDir()); prevID != "" {
		return fmt.Errorf("backup %s already in progress", prevID)
	}

	c.descriptor = &backup.DistributedBackupDescriptor{
		StartedAt:     time.Now().UTC(),
		Status:        backup.Started,
		ID:            req.ID,
		Nodes:         groups,
		Version:       Version,
		ServerVersion: config.ServerVersion,
	}

	for key := range c.Participants {
		delete(c.Participants, key)
	}

	nodes, err := c.canCommit(ctx, req)
	if err != nil {
		c.lastOp.reset()
		return err
	}

	if err := store.PutMeta(ctx, GlobalBackupFile, c.descriptor); err != nil {
		c.lastOp.reset()
		return fmt.Errorf("cannot init meta file: %w", err)
	}

	statusReq := StatusRequest{
		Method:  OpCreate,
		ID:      req.ID,
		Backend: req.Backend,
	}

	f := func() {
		defer c.lastOp.reset()
		ctx := context.Background()
		c.commit(ctx, &statusReq, nodes, false)
		logFields := logrus.Fields{"action": OpCreate, "backup_id": req.ID}
		if err := store.PutMeta(ctx, GlobalBackupFile, c.descriptor); err != nil {
			c.log.WithFields(logFields).Errorf("coordinator: put_meta: %v", err)
		}
		if c.descriptor.Status == backup.Success {
			c.log.WithFields(logFields).Info("coordinator: backup completed successfully")
		} else {
			c.log.WithFields(logFields).Errorf("coordinator: %s", c.descriptor.Error)
		}
	}
	enterrors.GoWrapper(f, c.log)

	return nil
}
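
// Backup returns as soon as every participant has agreed to commit; the
// commit phase itself runs in the goroutine launched above, and callers
// observe its progress through OnStatus.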

// Restore coordinates a distributed restoration among participants
func (c *coordinator) Restore(
	ctx context.Context,
	store coordStore,
	req *Request,
	desc *backup.DistributedBackupDescriptor,
) error {
	req.Method = OpRestore
	// make sure there is no other operation in progress
	if prevID := c.lastOp.renew(desc.ID, store.HomeDir()); prevID != "" {
		return fmt.Errorf("restoration %s already in progress", prevID)
	}

	for key := range c.Participants {
		delete(c.Participants, key)
	}
	c.descriptor = desc.ResetStatus()

	nodes, err := c.canCommit(ctx, req)
	if err != nil {
		c.lastOp.reset()
		return err
	}

	// initial put so restore status is immediately available
	if err := store.PutMeta(ctx, GlobalRestoreFile, c.descriptor); err != nil {
		c.lastOp.reset()
		req := &AbortRequest{Method: OpRestore, ID: desc.ID, Backend: req.Backend}
		c.abortAll(ctx, req, nodes)
		return fmt.Errorf("put initial metadata: %w", err)
	}

	statusReq := StatusRequest{Method: OpRestore, ID: desc.ID, Backend: req.Backend}
	g := func() {
		defer c.lastOp.reset()
		ctx := context.Background()
		c.commit(ctx, &statusReq, nodes, true)
		logFields := logrus.Fields{"action": OpRestore, "backup_id": desc.ID}
		if err := store.PutMeta(ctx, GlobalRestoreFile, c.descriptor); err != nil {
			c.log.WithFields(logFields).Errorf("coordinator: put_meta: %v", err)
		}
		if c.descriptor.Status == backup.Success {
			c.log.WithFields(logFields).Info("coordinator: backup restored successfully")
		} else {
			c.log.WithFields(logFields).Errorf("coordinator: %v", c.descriptor.Error)
		}
	}
	enterrors.GoWrapper(g, c.log)

	return nil
}

func (c *coordinator) OnStatus(ctx context.Context, store coordStore, req *StatusRequest) (*Status, error) {
	// check if the operation is still active
	st := c.lastOp.get()
	if st.ID == req.ID {
		return &Status{Path: st.Path, StartedAt: st.Starttime, Status: st.Status}, nil
	}
	filename := GlobalBackupFile
	if req.Method == OpRestore {
		filename = GlobalRestoreFile
	}
	// The backup might already have been created.
	meta, err := store.Meta(ctx, filename)
	if err != nil {
		path := fmt.Sprintf("%s/%s", req.ID, filename)
		return nil, fmt.Errorf("coordinator cannot get status: %w: %q: %v", errMetaNotFound, path, err)
	}

	return &Status{
		Path:        store.HomeDir(),
		StartedAt:   meta.StartedAt,
		CompletedAt: meta.CompletedAt,
		Status:      meta.Status,
		Err:         meta.Error,
	}, nil
}
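
// A hypothetical polling loop built on OnStatus (request shape assumed for
// illustration):
//
//	req := &StatusRequest{Method: OpCreate, ID: id, Backend: "s3"}
//	for {
//		st, err := c.OnStatus(ctx, store, req)
//		if err != nil || st.Status == backup.Success || st.Status == backup.Failed {
//			break
//		}
//		time.Sleep(time.Second)
//	}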

// canCommit asks candidates if they agree to participate in a DBRO.
// It returns an error if any candidate refuses to participate.
func (c *coordinator) canCommit(ctx context.Context, req *Request) (map[string]string, error) {
	ctx, cancel := context.WithTimeout(ctx, c.timeoutCanCommit)
	defer cancel()

	type nodeHost struct {
		node, host string
	}

	type pair struct {
		n nodeHost
		r *Request
	}

	id := c.descriptor.ID
	nodeMapping := c.descriptor.NodeMapping
	groups := c.descriptor.Nodes

	g, ctx := enterrors.NewErrorGroupWithContextWrapper(c.log, ctx)
	g.SetLimit(_MaxNumberConns)
	reqChan := make(chan pair)
	g.Go(func() error {
		defer close(reqChan)
		for node, gr := range groups {
			select {
			case <-ctx.Done():
				return ctx.Err()
			default:
			}

			// If nodeMapping contains the node name from the backup, replace it with the mapped name
			node = c.descriptor.ToMappedNodeName(node)

			host, found := c.nodeResolver.NodeHostname(node)
			if !found {
				return fmt.Errorf("cannot resolve hostname for %q", node)
			}

			reqChan <- pair{
				nodeHost{node, host},
				&Request{
					Method:      req.Method,
					ID:          id,
					Backend:     req.Backend,
					Classes:     gr.Classes,
					Duration:    _BookingPeriod,
					NodeMapping: nodeMapping,
					Compression: req.Compression,
				},
			}
		}
		return nil
	})

	mutex := sync.RWMutex{}
	nodes := make(map[string]string, len(groups))
	for pair := range reqChan {
		pair := pair
		g.Go(func() error {
			resp, err := c.client.CanCommit(ctx, pair.n.host, pair.r)
			if err == nil && resp.Timeout == 0 {
				err = fmt.Errorf("%w : %v", errCannotCommit, resp.Err)
			}
			if err != nil {
				return fmt.Errorf("node %q: %w", pair.n, err)
			}
			mutex.Lock()
			nodes[pair.n.node] = pair.n.host
			mutex.Unlock()
			return nil
		})
	}
	abortReq := &AbortRequest{Method: req.Method, ID: id, Backend: req.Backend}
	if err := g.Wait(); err != nil {
		c.abortAll(ctx, abortReq, nodes)
		return nil, err
	}
	return nodes, nil
}
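
// canCommit implements the first phase of a two-phase protocol: the request
// carries Duration=_BookingPeriod, which presumably tells a participant how
// long to hold its reservation while awaiting the commit (second phase; the
// participant side lives elsewhere). The producer goroutine streams one
// request per node into reqChan while up to _MaxNumberConns consumers call
// CanCommit concurrently; the first failure cancels the group and aborts
// every node that had already agreed.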

// commit tells each participant to commit its backup operation.
// It records the final result in c.descriptor; the caller persists it to the backend.
func (c *coordinator) commit(ctx context.Context,
	req *StatusRequest,
	node2Addr map[string]string,
	toleratePartialFailure bool,
) {
	// create a new copy for commitAll and queryAll to mutate
	node2Host := make(map[string]string, len(node2Addr))
	for k, v := range node2Addr {
		node2Host[k] = v
	}
	nFailures := c.commitAll(ctx, req, node2Host)
	retryAfter := c.timeoutNextRound / 5 // 2s for the first round
	canContinue := len(node2Host) > 0 && (toleratePartialFailure || nFailures == 0)
	for canContinue {
		<-time.After(retryAfter)
		retryAfter = c.timeoutNextRound
		nFailures += c.queryAll(ctx, req, node2Host)
		canContinue = len(node2Host) > 0 && (toleratePartialFailure || nFailures == 0)
	}
	if !toleratePartialFailure && nFailures > 0 {
		req := &AbortRequest{Method: req.Method, ID: req.ID, Backend: req.Backend}
		c.abortAll(context.Background(), req, node2Addr)
	}
	c.descriptor.CompletedAt = time.Now().UTC()
	status := backup.Success
	reason := ""
	groups := c.descriptor.Nodes
	for node, p := range c.Participants {
		st := groups[c.descriptor.ToOriginalNodeName(node)]
		st.Status, st.Error = p.Status, p.Reason
		if p.Status != backup.Success {
			status = backup.Failed
			reason = p.Reason
		}
		groups[node] = st
	}
	c.descriptor.Status = status
	c.descriptor.Error = reason
}
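
// With the defaults above, the first status poll happens after
// _NextRoundPeriod/5 = 2s and every _NextRoundPeriod = 10s thereafter; a
// node that stays unreachable for longer than _TimeoutNodeDown (7m) is
// marked failed by queryAll.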

// queryAll queries all participants and stores their statuses internally
//
// It returns the number of failed node backups
func (c *coordinator) queryAll(ctx context.Context, req *StatusRequest, nodes map[string]string) int {
	ctx, cancel := context.WithTimeout(ctx, c.timeoutQueryStatus)
	defer cancel()

	rs := make([]partialStatus, len(nodes))
	g, ctx := enterrors.NewErrorGroupWithContextWrapper(c.log, ctx)
	g.SetLimit(_MaxNumberConns)
	i := 0
	for node, hostname := range nodes {
		j := i
		hostname := hostname
		rs[j].node = node
		g.Go(func() error {
			rs[j].StatusResponse, rs[j].err = c.client.Status(ctx, hostname, req)
			return nil
		})
		i++
	}
	g.Wait()
	n, now := 0, time.Now()
	for _, r := range rs {
		st := c.Participants[r.node]
		if r.err == nil {
			st.LastTime, st.Status, st.Reason = now, r.Status, r.Err
			if r.Status == backup.Success {
				delete(nodes, r.node)
			}
			if r.Status == backup.Failed {
				delete(nodes, r.node)
				n++
			}
		} else if now.Sub(st.LastTime) > c.timeoutNodeDown {
			n++
			st.Status = backup.Failed
			st.Reason = fmt.Sprintf("node %q might be down: %v", r.node, r.err.Error())
			delete(nodes, r.node)
		}
		c.Participants[r.node] = st
	}
	return n
}

// commitAll tells all participants to proceed with their backup operations
// It returns the number of failures
func (c *coordinator) commitAll(ctx context.Context, req *StatusRequest, nodes map[string]string) int {
	type pair struct {
		node string
		err  error
	}
	errChan := make(chan pair)
	aCounter := int64(len(nodes))
	g, ctx := enterrors.NewErrorGroupWithContextWrapper(c.log, ctx)
	g.SetLimit(_MaxNumberConns)
	for node, hostname := range nodes {
		node, hostname := node, hostname
		g.Go(func() error {
			defer func() {
				if atomic.AddInt64(&aCounter, -1) == 0 {
					close(errChan)
				}
			}()
			err := c.client.Commit(ctx, hostname, req)
			if err != nil {
				errChan <- pair{node, err}
			}
			return nil
		})
	}
	nFailures := 0
	for x := range errChan {
		st := c.Participants[x.node]
		st.Status = backup.Failed
		st.Reason = "might be down: " + x.err.Error()
		c.Participants[x.node] = st
		c.log.WithField("action", req.Method).
			WithField("backup_id", req.ID).
			WithField("node", x.node).Error(x.err)
		delete(nodes, x.node)
		nFailures++
	}
	return nFailures
}
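
// The close-on-last-decrement pattern above lets the range over errChan
// terminate cleanly: every goroutine decrements aCounter on exit, and the
// one that reaches zero closes the channel. A generic sketch of the same
// idiom (hypothetical, for illustration):
//
//	remaining := int64(len(tasks))
//	errs := make(chan error)
//	for _, t := range tasks {
//		t := t
//		go func() {
//			defer func() {
//				if atomic.AddInt64(&remaining, -1) == 0 {
//					close(errs)
//				}
//			}()
//			if err := t.run(); err != nil {
//				errs <- err
//			}
//		}()
//	}
//	for err := range errs { /* collect failures */ _ = err }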

// abortAll tells every node to abort the transaction
func (c *coordinator) abortAll(ctx context.Context, req *AbortRequest, nodes map[string]string) {
	for name, hostname := range nodes {
		if err := c.client.Abort(ctx, hostname, req); err != nil {
			c.log.WithField("action", req.Method).
				WithField("backup_id", req.ID).
				WithField("node", name).Errorf("abort %v", err)
		}
	}
}

// groupByShard returns classes grouped by node
func (c *coordinator) groupByShard(ctx context.Context, classes []string) (nodeMap, error) {
	nodes := c.nodeResolver.AllNames()
	m := make(nodeMap, len(nodes))
	for _, node := range nodes {
		m[node] = &backup.NodeDescriptor{Classes: slices.Clone(classes)}
	}
	return m, nil
}
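
// Note that, despite its name, groupByShard currently assigns every class to
// every node (each node receives a clone of the full class list); the
// per-class shard lookup offered by the selector interface is not consulted
// here.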

// partialStatus tracks status of a single backup operation
type partialStatus struct {
	node string
	*StatusResponse
	err error
}