github.com/weaviate/weaviate@v1.24.6/usecases/backup/coordinator.go

//                           _       _
// __      _____  __ ___   ___  __ _| |_ ___
// \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
//  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
//   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
//
//  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
//
//  CONTACT: hello@weaviate.io
//

package backup

import (
	"context"
	"errors"
	"fmt"
	"slices"
	"sync"
	"sync/atomic"
	"time"

	enterrors "github.com/weaviate/weaviate/entities/errors"

	"github.com/sirupsen/logrus"
	"github.com/weaviate/weaviate/entities/backup"
	"github.com/weaviate/weaviate/usecases/config"
)

// Op is the kind of backup operation
type Op string

const (
	OpCreate  Op = "create"
	OpRestore Op = "restore"
)

var (
	// errNoShardFound = errors.New("no shard found")
	errCannotCommit = errors.New("cannot commit")
	errMetaNotFound = errors.New("metadata not found")
	errUnknownOp    = errors.New("unknown backup operation")
)

const (
	_BookingPeriod      = time.Second * 20
	_TimeoutNodeDown    = 7 * time.Minute
	_TimeoutQueryStatus = 5 * time.Second
	_TimeoutCanCommit   = 8 * time.Second
	_NextRoundPeriod    = 10 * time.Second
	_MaxNumberConns     = 16
)

type nodeMap map[string]*backup.NodeDescriptor

// participantStatus tracks the status of a participant in a DBRO
type participantStatus struct {
	Status   backup.Status
	LastTime time.Time
	Reason   string
}

// selector is used to select participant nodes
type selector interface {
	// Shards gets all nodes on which this class is sharded
	Shards(ctx context.Context, class string) ([]string, error)
	// ListClasses returns a list of all existing classes.
	// This is needed if the user doesn't include any classes.
	ListClasses(ctx context.Context) []string

	// Backupable returns whether all given classes can be backed up.
	Backupable(_ context.Context, classes []string) error
}
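// The sketch below is illustrative only and not part of this package: it shows
// a minimal type satisfying the selector contract above, serving a fixed class
// list and placing every class on a fixed set of nodes. The name staticSelector
// and its fields are assumptions for the example; the real selector resolves
// shards from the schema.
type staticSelector struct {
	classes []string
	nodes   []string
}

var _ selector = (*staticSelector)(nil)

// Shards reports the nodes hosting the given class, or an error if the class is unknown.
func (s *staticSelector) Shards(_ context.Context, class string) ([]string, error) {
	if !slices.Contains(s.classes, class) {
		return nil, fmt.Errorf("class %q not found", class)
	}
	return slices.Clone(s.nodes), nil
}

// ListClasses returns every class this selector knows about.
func (s *staticSelector) ListClasses(context.Context) []string {
	return slices.Clone(s.classes)
}

// Backupable accepts all classes; a real implementation would reject classes
// that are missing or not in a backupable state.
func (s *staticSelector) Backupable(context.Context, []string) error { return nil }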
// coordinator coordinates a distributed backup and restore operation (DBRO):
//
// - It determines which request to send to which shard.
//
// - It returns an error if any shard refuses to participate in the DBRO.
//
// - It keeps all metadata needed to resume a DBRO in external storage (e.g. S3).
//
// - When it starts, it checks for any broken DBROs using its metadata.
//
// - It can resume a broken DBRO.
//
// - It marks the whole DBRO as failed if any shard fails to do its BRO.
//
// - The coordinator will try to repair previous DBROs whenever possible.
type coordinator struct {
	// dependencies
	selector     selector
	client       client
	log          logrus.FieldLogger
	nodeResolver nodeResolver

	// state
	Participants map[string]participantStatus
	descriptor   *backup.DistributedBackupDescriptor
	shardSyncChan

	// timeouts
	timeoutNodeDown    time.Duration
	timeoutQueryStatus time.Duration
	timeoutCanCommit   time.Duration
	timeoutNextRound   time.Duration
}

// newCoordinator creates an instance which coordinates distributed BRO operations among many shards.
func newCoordinator(
	selector selector,
	client client,
	log logrus.FieldLogger,
	nodeResolver nodeResolver,
) *coordinator {
	return &coordinator{
		selector:           selector,
		client:             client,
		log:                log,
		nodeResolver:       nodeResolver,
		Participants:       make(map[string]participantStatus, 16),
		timeoutNodeDown:    _TimeoutNodeDown,
		timeoutQueryStatus: _TimeoutQueryStatus,
		timeoutCanCommit:   _TimeoutCanCommit,
		timeoutNextRound:   _NextRoundPeriod,
	}
}

// Backup coordinates a distributed backup among participants
func (c *coordinator) Backup(ctx context.Context, store coordStore, req *Request) error {
	req.Method = OpCreate
	groups, err := c.groupByShard(ctx, req.Classes)
	if err != nil {
		return err
	}
	// make sure there is no active backup
	if prevID := c.lastOp.renew(req.ID, store.HomeDir()); prevID != "" {
		return fmt.Errorf("backup %s already in progress", prevID)
	}

	c.descriptor = &backup.DistributedBackupDescriptor{
		StartedAt:     time.Now().UTC(),
		Status:        backup.Started,
		ID:            req.ID,
		Nodes:         groups,
		Version:       Version,
		ServerVersion: config.ServerVersion,
	}

	for key := range c.Participants {
		delete(c.Participants, key)
	}

	nodes, err := c.canCommit(ctx, req)
	if err != nil {
		c.lastOp.reset()
		return err
	}

	if err := store.PutMeta(ctx, GlobalBackupFile, c.descriptor); err != nil {
		c.lastOp.reset()
		return fmt.Errorf("cannot init meta file: %w", err)
	}

	statusReq := StatusRequest{
		Method:  OpCreate,
		ID:      req.ID,
		Backend: req.Backend,
	}

	f := func() {
		defer c.lastOp.reset()
		ctx := context.Background()
		c.commit(ctx, &statusReq, nodes, false)
		logFields := logrus.Fields{"action": OpCreate, "backup_id": req.ID}
		if err := store.PutMeta(ctx, GlobalBackupFile, c.descriptor); err != nil {
			c.log.WithFields(logFields).Errorf("coordinator: put_meta: %v", err)
		}
		if c.descriptor.Status == backup.Success {
			c.log.WithFields(logFields).Info("coordinator: backup completed successfully")
		} else {
			c.log.WithFields(logFields).Errorf("coordinator: %s", c.descriptor.Error)
		}
	}
	enterrors.GoWrapper(f, c.log)

	return nil
}

// Restore coordinates a distributed restoration among participants
func (c *coordinator) Restore(
	ctx context.Context,
	store coordStore,
	req *Request,
	desc *backup.DistributedBackupDescriptor,
) error {
	req.Method = OpRestore
	// make sure there is no active backup or restore
	if prevID := c.lastOp.renew(desc.ID, store.HomeDir()); prevID != "" {
		return fmt.Errorf("restoration %s already in progress", prevID)
	}

	for key := range c.Participants {
		delete(c.Participants, key)
	}
	c.descriptor = desc.ResetStatus()

	nodes, err := c.canCommit(ctx, req)
	if err != nil {
		c.lastOp.reset()
		return err
	}

	// initial put so restore status is immediately available
	if err := store.PutMeta(ctx, GlobalRestoreFile, c.descriptor); err != nil {
		c.lastOp.reset()
		req := &AbortRequest{Method: OpRestore, ID: desc.ID, Backend: req.Backend}
		c.abortAll(ctx, req, nodes)
		return fmt.Errorf("put initial metadata: %w", err)
	}

	statusReq := StatusRequest{Method: OpRestore, ID: desc.ID, Backend: req.Backend}
	g := func() {
		defer c.lastOp.reset()
		ctx := context.Background()
		c.commit(ctx, &statusReq, nodes, true)
		logFields := logrus.Fields{"action": OpRestore, "backup_id": desc.ID}
		if err := store.PutMeta(ctx, GlobalRestoreFile, c.descriptor); err != nil {
			c.log.WithFields(logFields).Errorf("coordinator: put_meta: %v", err)
		}
		if c.descriptor.Status == backup.Success {
			c.log.WithFields(logFields).Info("coordinator: backup restored successfully")
		} else {
			c.log.WithFields(logFields).Errorf("coordinator: %v", c.descriptor.Error)
		}
	}
	enterrors.GoWrapper(g, c.log)

	return nil
}

func (c *coordinator) OnStatus(ctx context.Context, store coordStore, req *StatusRequest) (*Status, error) {
	// check if the backup is still active
	st := c.lastOp.get()
	if st.ID == req.ID {
		return &Status{Path: st.Path, StartedAt: st.Starttime, Status: st.Status}, nil
	}
	filename := GlobalBackupFile
	if req.Method == OpRestore {
		filename = GlobalRestoreFile
	}
	// The backup might have already been created.
	meta, err := store.Meta(ctx, filename)
	if err != nil {
		path := fmt.Sprintf("%s/%s", req.ID, filename)
		return nil, fmt.Errorf("coordinator cannot get status: %w: %q: %v", errMetaNotFound, path, err)
	}

	return &Status{
		Path:        store.HomeDir(),
		StartedAt:   meta.StartedAt,
		CompletedAt: meta.CompletedAt,
		Status:      meta.Status,
		Err:         meta.Error,
	}, nil
}
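// The helper below is an illustrative sketch, not part of the package API:
// Backup and Restore return as soon as the commit phase has been scheduled, so
// a caller that wants to block until completion can poll OnStatus until the
// descriptor reaches a terminal state. The name awaitBackup and the one-second
// poll interval are assumptions for the example.
func awaitBackup(ctx context.Context, c *coordinator, store coordStore, req *Request) (*Status, error) {
	if err := c.Backup(ctx, store, req); err != nil {
		return nil, fmt.Errorf("start backup: %w", err)
	}
	statusReq := &StatusRequest{Method: OpCreate, ID: req.ID, Backend: req.Backend}
	ticker := time.NewTicker(time.Second)
	defer ticker.Stop()
	for {
		st, err := c.OnStatus(ctx, store, statusReq)
		if err != nil {
			return nil, err
		}
		// Success and Failed are the terminal states written by commit().
		if st.Status == backup.Success || st.Status == backup.Failed {
			return st, nil
		}
		select {
		case <-ctx.Done():
			return nil, ctx.Err()
		case <-ticker.C:
		}
	}
}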
// canCommit asks candidates if they agree to participate in the DBRO.
// It returns an error if any candidate refuses to participate.
func (c *coordinator) canCommit(ctx context.Context, req *Request) (map[string]string, error) {
	ctx, cancel := context.WithTimeout(ctx, c.timeoutCanCommit)
	defer cancel()

	type nodeHost struct {
		node, host string
	}

	type pair struct {
		n nodeHost
		r *Request
	}

	id := c.descriptor.ID
	nodeMapping := c.descriptor.NodeMapping
	groups := c.descriptor.Nodes

	g, ctx := enterrors.NewErrorGroupWithContextWrapper(c.log, ctx)
	g.SetLimit(_MaxNumberConns)
	reqChan := make(chan pair)
	g.Go(func() error {
		defer close(reqChan)
		for node, gr := range groups {
			select {
			case <-ctx.Done():
				return ctx.Err()
			default:
			}

			// If the nodeMapping contains the node name from the backup, replace the node with the new one
			node = c.descriptor.ToMappedNodeName(node)

			host, found := c.nodeResolver.NodeHostname(node)
			if !found {
				return fmt.Errorf("cannot resolve hostname for %q", node)
			}

			reqChan <- pair{
				nodeHost{node, host},
				&Request{
					Method:      req.Method,
					ID:          id,
					Backend:     req.Backend,
					Classes:     gr.Classes,
					Duration:    _BookingPeriod,
					NodeMapping: nodeMapping,
					Compression: req.Compression,
				},
			}
		}
		return nil
	})

	mutex := sync.RWMutex{}
	nodes := make(map[string]string, len(groups))
	for pair := range reqChan {
		pair := pair
		g.Go(func() error {
			resp, err := c.client.CanCommit(ctx, pair.n.host, pair.r)
			if err == nil && resp.Timeout == 0 {
				err = fmt.Errorf("%w : %v", errCannotCommit, resp.Err)
			}
			if err != nil {
				return fmt.Errorf("node %q: %w", pair.n, err)
			}
			mutex.Lock()
			nodes[pair.n.node] = pair.n.host
			mutex.Unlock()
			return nil
		})
	}
	abortReq := &AbortRequest{Method: req.Method, ID: id, Backend: req.Backend}
	if err := g.Wait(); err != nil {
		c.abortAll(ctx, abortReq, nodes)
		return nil, err
	}
	return nodes, nil
}
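// The helper below is an illustrative sketch, not part of the package API: it
// shows how the coordinator's timeouts, which drive canCommit above, the commit
// retry loop, and node-down detection in queryAll, could be tightened, e.g. in
// tests. The name newFastCoordinator and the chosen durations are assumptions
// for the example.
func newFastCoordinator(sel selector, cl client, log logrus.FieldLogger, res nodeResolver) *coordinator {
	c := newCoordinator(sel, cl, log, res)
	c.timeoutCanCommit = 500 * time.Millisecond   // budget for the CanCommit round trip
	c.timeoutQueryStatus = 100 * time.Millisecond // per-round status query budget
	c.timeoutNextRound = 200 * time.Millisecond   // pause between status polling rounds
	c.timeoutNodeDown = 2 * time.Second           // how long a silent node is tolerated
	return c
}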
// commit tells each participant to commit its backup operation.
// It stores the final result in the provided backend.
func (c *coordinator) commit(ctx context.Context,
	req *StatusRequest,
	node2Addr map[string]string,
	toleratePartialFailure bool,
) {
	// create a new copy for commitAll and queryAll to mutate
	node2Host := make(map[string]string, len(node2Addr))
	for k, v := range node2Addr {
		node2Host[k] = v
	}
	nFailures := c.commitAll(ctx, req, node2Host)
	retryAfter := c.timeoutNextRound / 5 // 2s for the first round
	canContinue := len(node2Host) > 0 && (toleratePartialFailure || nFailures == 0)
	for canContinue {
		<-time.After(retryAfter)
		retryAfter = c.timeoutNextRound
		nFailures += c.queryAll(ctx, req, node2Host)
		canContinue = len(node2Host) > 0 && (toleratePartialFailure || nFailures == 0)
	}
	if !toleratePartialFailure && nFailures > 0 {
		req := &AbortRequest{Method: req.Method, ID: req.ID, Backend: req.Backend}
		c.abortAll(context.Background(), req, node2Addr)
	}
	c.descriptor.CompletedAt = time.Now().UTC()
	status := backup.Success
	reason := ""
	groups := c.descriptor.Nodes
	for node, p := range c.Participants {
		st := groups[c.descriptor.ToOriginalNodeName(node)]
		st.Status, st.Error = p.Status, p.Reason
		if p.Status != backup.Success {
			status = backup.Failed
			reason = p.Reason
		}
		groups[node] = st
	}
	c.descriptor.Status = status
	c.descriptor.Error = reason
}

// queryAll queries all participants and stores their statuses internally.
//
// It returns the number of failed node backups.
func (c *coordinator) queryAll(ctx context.Context, req *StatusRequest, nodes map[string]string) int {
	ctx, cancel := context.WithTimeout(ctx, c.timeoutQueryStatus)
	defer cancel()

	rs := make([]partialStatus, len(nodes))
	g, ctx := enterrors.NewErrorGroupWithContextWrapper(c.log, ctx)
	g.SetLimit(_MaxNumberConns)
	i := 0
	for node, hostname := range nodes {
		j := i
		hostname := hostname
		rs[j].node = node
		g.Go(func() error {
			rs[j].StatusResponse, rs[j].err = c.client.Status(ctx, hostname, req)
			return nil
		})
		i++
	}
	g.Wait()
	n, now := 0, time.Now()
	for _, r := range rs {
		st := c.Participants[r.node]
		if r.err == nil {
			st.LastTime, st.Status, st.Reason = now, r.Status, r.Err
			if r.Status == backup.Success {
				delete(nodes, r.node)
			}
			if r.Status == backup.Failed {
				delete(nodes, r.node)
				n++
			}
		} else if now.Sub(st.LastTime) > c.timeoutNodeDown {
			n++
			st.Status = backup.Failed
			st.Reason = fmt.Sprintf("node %q might be down: %v", r.node, r.err.Error())
			delete(nodes, r.node)
		}
		c.Participants[r.node] = st
	}
	return n
}

// commitAll tells all participants to proceed with their backup operations.
// It returns the number of failures.
func (c *coordinator) commitAll(ctx context.Context, req *StatusRequest, nodes map[string]string) int {
	type pair struct {
		node string
		err  error
	}
	errChan := make(chan pair)
	aCounter := int64(len(nodes))
	g, ctx := enterrors.NewErrorGroupWithContextWrapper(c.log, ctx)
	g.SetLimit(_MaxNumberConns)
	for node, hostname := range nodes {
		node, hostname := node, hostname
		g.Go(func() error {
			defer func() {
				if atomic.AddInt64(&aCounter, -1) == 0 {
					close(errChan)
				}
			}()
			err := c.client.Commit(ctx, hostname, req)
			if err != nil {
				errChan <- pair{node, err}
			}
			return nil
		})
	}
	nFailures := 0
	for x := range errChan {
		st := c.Participants[x.node]
		st.Status = backup.Failed
		st.Reason = "might be down: " + x.err.Error()
		c.Participants[x.node] = st
		c.log.WithField("action", req.Method).
			WithField("backup_id", req.ID).
			WithField("node", x.node).Error(x.err)
		delete(nodes, x.node)
		nFailures++
	}
	return nFailures
}
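// The helper below is an illustrative sketch, not part of the package API: it
// summarizes the participant bookkeeping that commit, queryAll, and commitAll
// maintain, returning the nodes whose last reported status is Failed together
// with the recorded reason. The name failedParticipants is an assumption for
// the example.
func failedParticipants(participants map[string]participantStatus) map[string]string {
	failed := make(map[string]string, len(participants))
	for node, p := range participants {
		if p.Status == backup.Failed {
			failed[node] = p.Reason
		}
	}
	return failed
}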
// abortAll tells every node to abort the transaction
func (c *coordinator) abortAll(ctx context.Context, req *AbortRequest, nodes map[string]string) {
	for name, hostname := range nodes {
		if err := c.client.Abort(ctx, hostname, req); err != nil {
			c.log.WithField("action", req.Method).
				WithField("backup_id", req.ID).
				WithField("node", name).Errorf("abort %v", err)
		}
	}
}

// groupByShard returns classes grouped by node
func (c *coordinator) groupByShard(ctx context.Context, classes []string) (nodeMap, error) {
	nodes := c.nodeResolver.AllNames()
	m := make(nodeMap, len(nodes))
	for _, node := range nodes {
		m[node] = &backup.NodeDescriptor{Classes: slices.Clone(classes)}
	}
	return m, nil
}

// partialStatus tracks the status of a single backup operation
type partialStatus struct {
	node string
	*StatusResponse
	err error
}
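// The helper below is an illustrative sketch, not part of the package API: it
// renders the nodeMap produced by groupByShard as "node -> classes" lines,
// which can be useful when logging what each participant is asked to back up.
// The name describeGroups is an assumption for the example.
func describeGroups(groups nodeMap) []string {
	out := make([]string, 0, len(groups))
	for node, desc := range groups {
		out = append(out, fmt.Sprintf("%s -> %v", node, desc.Classes))
	}
	slices.Sort(out) // deterministic order regardless of map iteration
	return out
}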