github.com/mutagen-io/mutagen@v0.18.0-rc1/pkg/synchronization/controller.go

package synchronization

import (
	"context"
	"errors"
	"fmt"
	"os"
	"sync"
	"time"

	"google.golang.org/protobuf/proto"
	"google.golang.org/protobuf/types/known/timestamppb"

	"github.com/mutagen-io/mutagen/pkg/encoding"
	"github.com/mutagen-io/mutagen/pkg/logging"
	"github.com/mutagen-io/mutagen/pkg/mutagen"
	"github.com/mutagen-io/mutagen/pkg/prompting"
	"github.com/mutagen-io/mutagen/pkg/state"
	"github.com/mutagen-io/mutagen/pkg/synchronization/core"
	"github.com/mutagen-io/mutagen/pkg/synchronization/core/ignore"
	"github.com/mutagen-io/mutagen/pkg/synchronization/rsync"
	"github.com/mutagen-io/mutagen/pkg/url"
)

const (
	// autoReconnectInterval is the period of time to wait before attempting an
	// automatic reconnect after disconnection or a failed reconnect.
	autoReconnectInterval = 15 * time.Second
	// rescanWaitDuration is the period of time to wait before attempting to
	// rescan after an ephemeral scan failure.
	rescanWaitDuration = 5 * time.Second
)

// controller manages and executes a single session.
type controller struct {
	// logger is the controller logger.
	logger *logging.Logger
	// sessionPath is the path to the serialized session.
	sessionPath string
	// archivePath is the path to the serialized archive.
	archivePath string
	// stateLock guards and tracks changes to the session's Paused field, state,
	// and synchronizing. Previous holders may continue to poll on synchronizing
	// if they store it in a separate variable before releasing the lock.
	stateLock *state.TrackingLock
	// session encodes the associated session metadata. It is considered static
	// and safe for concurrent access except for its Paused field, for which
	// stateLock should be held. It should be saved to disk any time it is
	// modified.
	session *Session
	// mergedAlphaConfiguration is the alpha-specific configuration object
	// (computed from the core configuration and alpha-specific overrides). It
	// is considered static and safe for concurrent access. It is a derived
	// field and not saved to disk.
	mergedAlphaConfiguration *Configuration
	// mergedBetaConfiguration is the beta-specific configuration object
	// (computed from the core configuration and beta-specific overrides). It is
	// considered static and safe for concurrent access. It is a derived field
	// and not saved to disk.
	mergedBetaConfiguration *Configuration
	// state represents the current synchronization state.
	state *State
	// synchronizing is used to track whether or not the synchronization loop is
	// currently in a state where it is capable of performing synchronization.
	// It is non-nil if and only if the synchronization loop is connected and in
	// a state where it can perform synchronization. It is closed when
	// synchronization fails due to an error.
	synchronizing chan struct{}
	// lifecycleLock guards access to disabled, cancel, flushRequests, and done.
	// Only the current holder of the lifecycle lock may set any of these fields
	// or invoke cancel. The synchronization loop may close done or receive from
	// flushRequests without holding the lifecycle lock. Moreover, previous
	// lifecycle lock holders may continue to send to flushRequests and poll on
	// done after storing them in separate variables and releasing the lifecycle
	// lock.
	// Any code wishing to set these fields must first acquire the lock, then
	// cancel the synchronization loop and wait for it to complete before making
	// any changes.
	lifecycleLock sync.Mutex
	// disabled indicates that no more changes to the synchronization loop
	// lifecycle are allowed (i.e. no more synchronization loops can be started
	// for this controller). This is used by terminate and shutdown. It should
	// only be set to true once any existing synchronization loop has been
	// stopped.
	disabled bool
	// cancel cancels the synchronization loop execution context. It is nil if
	// and only if there is no synchronization loop running.
	cancel context.CancelFunc
	// flushRequests is used to pass flush requests to the synchronization loop.
	// It is buffered, allowing a single request to be queued. All requests
	// passed via this channel must be buffered and contain room for one error.
	flushRequests chan chan error
	// done will be closed by the current synchronization loop when it exits.
	done chan struct{}
}

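// Editorial note (hedged sketch, not part of the original source): per the
// lifecycleLock field comments above, code that needs to stop the loop and
// mutate lifecycle state follows the same pattern used by resume and halt
// below:
//
//	c.lifecycleLock.Lock()
//	if c.cancel != nil {
//		c.cancel()
//		<-c.done
//		c.cancel, c.flushRequests, c.done = nil, nil, nil
//	}
//	// ... mutate disabled, session.Paused, etc. ...
//	c.lifecycleLock.Unlock()
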
// newSession creates a new session and corresponding controller.
func newSession(
	ctx context.Context,
	logger *logging.Logger,
	tracker *state.Tracker,
	identifier string,
	alpha, beta *url.URL,
	configuration, configurationAlpha, configurationBeta *Configuration,
	name string,
	labels map[string]string,
	paused bool,
	prompter string,
) (*controller, error) {
	// Update status.
	prompting.Message(prompter, "Creating session...")

	// Set the session version.
	version := DefaultVersion

	// Compute the creation time and check that it's valid for Protocol Buffers.
	creationTime := timestamppb.Now()
	if err := creationTime.CheckValid(); err != nil {
		return nil, fmt.Errorf("unable to record creation time: %w", err)
	}

	// Compute merged endpoint configurations.
	mergedAlphaConfiguration := MergeConfigurations(configuration, configurationAlpha)
	mergedBetaConfiguration := MergeConfigurations(configuration, configurationBeta)

	// If the session isn't being created paused, then try to connect to the
	// endpoints. Before doing so, set up a deferred handler that will shut down
	// any endpoints that aren't handed off to the run loop due to errors.
	var alphaEndpoint, betaEndpoint Endpoint
	var err error
	defer func() {
		if alphaEndpoint != nil {
			alphaEndpoint.Shutdown()
			alphaEndpoint = nil
		}
		if betaEndpoint != nil {
			betaEndpoint.Shutdown()
			betaEndpoint = nil
		}
	}()
	if !paused {
		logger.Info("Connecting to alpha endpoint")
		alphaEndpoint, err = connect(
			ctx,
			logger.Sublogger("alpha"),
			alpha,
			prompter,
			identifier,
			version,
			mergedAlphaConfiguration,
			true,
		)
		if err != nil {
			logger.Info("Alpha connection failure:", err)
			return nil, fmt.Errorf("unable to connect to alpha: %w", err)
		}
		logger.Info("Connecting to beta endpoint")
		betaEndpoint, err = connect(
			ctx,
			logger.Sublogger("beta"),
			beta,
			prompter,
			identifier,
			version,
			mergedBetaConfiguration,
			false,
		)
		if err != nil {
			logger.Info("Beta connection failure:", err)
			return nil, fmt.Errorf("unable to connect to beta: %w", err)
		}
	}

	// Create the session and initial archive.
	session := &Session{
		Identifier:           identifier,
		Version:              version,
		CreationTime:         creationTime,
		CreatingVersionMajor: mutagen.VersionMajor,
		CreatingVersionMinor: mutagen.VersionMinor,
		CreatingVersionPatch: mutagen.VersionPatch,
		Alpha:                alpha,
		Beta:                 beta,
		Configuration:        configuration,
		ConfigurationAlpha:   configurationAlpha,
		ConfigurationBeta:    configurationBeta,
		Name:                 name,
		Labels:               labels,
		Paused:               paused,
	}
	archive := &core.Archive{}

	// Compute the session and archive paths.
	sessionPath, err := pathForSession(session.Identifier)
	if err != nil {
		return nil, fmt.Errorf("unable to compute session path: %w", err)
	}
	archivePath, err := pathForArchive(session.Identifier)
	if err != nil {
		return nil, fmt.Errorf("unable to compute archive path: %w", err)
	}

	// Save components to disk.
	if err := encoding.MarshalAndSaveProtobuf(sessionPath, session); err != nil {
		return nil, fmt.Errorf("unable to save session: %w", err)
	}
	if err := encoding.MarshalAndSaveProtobuf(archivePath, archive); err != nil {
		os.Remove(sessionPath)
		return nil, fmt.Errorf("unable to save archive: %w", err)
	}

	// Create the controller.
	controller := &controller{
		logger:                   logger,
		sessionPath:              sessionPath,
		archivePath:              archivePath,
		stateLock:                state.NewTrackingLock(tracker),
		session:                  session,
		mergedAlphaConfiguration: mergedAlphaConfiguration,
		mergedBetaConfiguration:  mergedBetaConfiguration,
		state: &State{
			Session:    session,
			AlphaState: &EndpointState{},
			BetaState:  &EndpointState{},
		},
	}

	// If the session isn't being created paused, then start a synchronization
	// loop and mark the endpoints as handed off to that loop so that we don't
	// defer their shutdown.
	if !paused {
		ctx, cancel := context.WithCancel(context.Background())
		controller.cancel = cancel
		controller.flushRequests = make(chan chan error, 1)
		controller.done = make(chan struct{})
		go controller.run(ctx, alphaEndpoint, betaEndpoint)
		alphaEndpoint = nil
		betaEndpoint = nil
	}

	// Success.
	logger.Info("Session initialized")
	return controller, nil
}

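// Editorial sketch (not part of the original file): a hypothetical caller
// creating a session in a paused state and connecting it later via resume.
// All argument values are assumed to be supplied by the caller; the empty
// prompter string mirrors the prompterless calls made elsewhere in this file.
func exampleCreatePausedThenResume(
	ctx context.Context,
	logger *logging.Logger,
	tracker *state.Tracker,
	identifier string,
	alpha, beta *url.URL,
	configuration *Configuration,
) error {
	// Create the controller without connecting to either endpoint.
	c, err := newSession(
		ctx, logger, tracker, identifier,
		alpha, beta,
		configuration, &Configuration{}, &Configuration{},
		"", nil, true, "",
	)
	if err != nil {
		return fmt.Errorf("unable to create session: %w", err)
	}

	// Connect to both endpoints and start the synchronization loop.
	return c.resume(ctx, "", false)
}
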
// loadSession loads an existing session and creates a corresponding controller.
func loadSession(logger *logging.Logger, tracker *state.Tracker, identifier string) (*controller, error) {
	// Compute session and archive paths.
	sessionPath, err := pathForSession(identifier)
	if err != nil {
		return nil, fmt.Errorf("unable to compute session path: %w", err)
	}
	archivePath, err := pathForArchive(identifier)
	if err != nil {
		return nil, fmt.Errorf("unable to compute archive path: %w", err)
	}

	// Load and validate the session. We have to populate a few optional fields
	// before validation if they're not set. We can't do this in the Session
	// literal because they'll be wiped out during unmarshalling, even if not
	// set.
	session := &Session{}
	if err := encoding.LoadAndUnmarshalProtobuf(sessionPath, session); err != nil {
		return nil, fmt.Errorf("unable to load session configuration: %w", err)
	}
	if session.ConfigurationAlpha == nil {
		session.ConfigurationAlpha = &Configuration{}
	}
	if session.ConfigurationBeta == nil {
		session.ConfigurationBeta = &Configuration{}
	}
	if err := session.EnsureValid(); err != nil {
		return nil, fmt.Errorf("invalid session found on disk: %w", err)
	}

	// Create the controller.
	controller := &controller{
		logger:      logger,
		sessionPath: sessionPath,
		archivePath: archivePath,
		stateLock:   state.NewTrackingLock(tracker),
		session:     session,
		mergedAlphaConfiguration: MergeConfigurations(
			session.Configuration,
			session.ConfigurationAlpha,
		),
		mergedBetaConfiguration: MergeConfigurations(
			session.Configuration,
			session.ConfigurationBeta,
		),
		state: &State{
			Session:    session,
			AlphaState: &EndpointState{},
			BetaState:  &EndpointState{},
		},
	}

	// If the session isn't marked as paused, start a synchronization loop.
	if !session.Paused {
		ctx, cancel := context.WithCancel(context.Background())
		controller.cancel = cancel
		controller.flushRequests = make(chan chan error, 1)
		controller.done = make(chan struct{})
		go controller.run(ctx, nil, nil)
	}

	// Success.
	logger.Info("Session loaded")
	return controller, nil
}

// currentState creates a static snapshot of the current session state.
func (c *controller) currentState() *State {
	// Lock the session state and defer its release. It's very important that we
	// unlock without a notification here, otherwise we'd trigger an infinite
	// cycle of list/notify.
	c.stateLock.Lock()
	defer c.stateLock.UnlockWithoutNotify()

	// Create a static copy of the state.
	return proto.Clone(c.state).(*State)
}

// flush attempts to force a synchronization cycle for the session. Unless
// skipWait is specified, the method will wait until a post-flush
// synchronization cycle has completed. The provided context (which must be
// non-nil) can terminate this wait early.
func (c *controller) flush(ctx context.Context, prompter string, skipWait bool) error {
	// Update status.
	prompting.Message(prompter, fmt.Sprintf("Forcing synchronization cycle for session %s...", c.session.Identifier))

	// Lock the controller's lifecycle.
	c.lifecycleLock.Lock()

	// Don't allow any operations if the controller is disabled.
	if c.disabled {
		c.lifecycleLock.Unlock()
		return errors.New("controller disabled")
	}

	// Check if the session is paused.
	if c.cancel == nil {
		c.lifecycleLock.Unlock()
		return errors.New("session is paused")
	}

	// Perform logging.
	c.logger.Infof("Forcing synchronization cycle")

	// Check if the session is currently synchronizing and store the channel
	// that we'll use to track synchronizability.
	c.stateLock.Lock()
	synchronizing := c.synchronizing
	c.stateLock.UnlockWithoutNotify()
	if synchronizing == nil {
		c.lifecycleLock.Unlock()
		return errors.New("session is not currently able to synchronize")
	}

	// Store the channels that we'll need to submit flush requests and track
	// synchronization termination.
	flushRequests := c.flushRequests
	done := c.done

	// Release the lifecycle lock.
	c.lifecycleLock.Unlock()

	// Create a flush request.
	request := make(chan error, 1)

	// If we don't want to wait, then we can simply send the request in a
	// non-blocking manner, in which case either this request (or one that's
	// already queued) will be processed eventually. After that, we're done. In
	// this case, we'll still check for an inability to synchronize, since we
	// may as well report it if we can.
	if skipWait {
		select {
		case flushRequests <- request:
			return nil
		case <-synchronizing:
			return errors.New("synchronization failed before flush request could be sent")
		case <-done:
			return errors.New("synchronization terminated before flush request could be sent")
		default:
			return nil
		}
	}

	// Otherwise we need to send the request in a blocking manner, watching for
	// cancellation, failure, or termination.
	select {
	case flushRequests <- request:
	case <-ctx.Done():
		return errors.New("flush cancelled before request could be sent")
	case <-synchronizing:
		return errors.New("synchronization failed before flush request could be sent")
	case <-done:
		return errors.New("synchronization terminated before flush request could be sent")
	}

	// Now we need to wait for a response to the request, again watching for
	// cancellation, failure, or termination.
	select {
	case err := <-request:
		return err
	case <-ctx.Done():
		return errors.New("flush cancelled while waiting for response")
	case <-synchronizing:
		return errors.New("synchronization failed while waiting for flush response")
	case <-done:
		return errors.New("synchronization terminated while waiting for flush response")
	}
}

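// Editorial sketch (not part of the original file): a hypothetical caller
// forcing a synchronization cycle and bounding the wait with a timeout. The
// 30-second timeout and empty prompter are illustrative assumptions.
func exampleFlushWithTimeout(c *controller) error {
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()
	return c.flush(ctx, "", false)
}
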
// resume attempts to reconnect and resume the session if it isn't currently
// connected and synchronizing. If lifecycleLockHeld is true, then resume will
// assume that the lifecycle lock is held by the caller and will not attempt to
// acquire it.
func (c *controller) resume(ctx context.Context, prompter string, lifecycleLockHeld bool) error {
	// Update status.
	prompting.Message(prompter, fmt.Sprintf("Resuming session %s...", c.session.Identifier))

	// If not already held, acquire the lifecycle lock and defer its release.
	if !lifecycleLockHeld {
		c.lifecycleLock.Lock()
		defer c.lifecycleLock.Unlock()
	}

	// Don't allow any resume operations if the controller is disabled.
	if c.disabled {
		return errors.New("controller disabled")
	}

	// Perform logging.
	c.logger.Infof("Resuming")

	// Check if there's an existing synchronization loop (i.e. if the session is
	// unpaused).
	if c.cancel != nil {
		// If there is an existing synchronization loop, check if it's already
		// in a state that's considered "connected".
		c.stateLock.Lock()
		connected := c.state.Status >= Status_Watching
		c.stateLock.UnlockWithoutNotify()

		// If we're already connected, then there's nothing we need to do. We
		// don't even need to mark the session as unpaused because it can't be
		// marked as paused if an existing synchronization loop is running (we
		// enforce this invariant as part of the controller's logic).
		if connected {
			return nil
		}

		// Otherwise, cancel the existing synchronization loop and wait for it
		// to finish.
		//
		// There's something of an efficiency race condition here, because the
		// existing loop might succeed in connecting between the time we check
		// and the time we cancel it. That could happen if an auto-reconnect
		// succeeds or even if the loop was already passed connections and it
		// just hasn't updated its status yet. But the only danger here is
		// basically wasting those connections, and the window is very small.
		c.cancel()
		<-c.done

		// Nil out any lifecycle state.
		c.cancel = nil
		c.flushRequests = nil
		c.done = nil
	}

	// Mark the session as unpaused and save it to disk.
	c.stateLock.Lock()
	c.session.Paused = false
	saveErr := encoding.MarshalAndSaveProtobuf(c.sessionPath, c.session)
	c.stateLock.Unlock()

	// Attempt to connect to alpha.
	c.stateLock.Lock()
	c.state.Status = Status_ConnectingAlpha
	c.stateLock.Unlock()
	alpha, alphaConnectErr := connect(
		ctx,
		c.logger.Sublogger("alpha"),
		c.session.Alpha,
		prompter,
		c.session.Identifier,
		c.session.Version,
		c.mergedAlphaConfiguration,
		true,
	)
	c.stateLock.Lock()
	c.state.AlphaState.Connected = (alpha != nil)
	c.stateLock.Unlock()

	// Attempt to connect to beta.
	c.stateLock.Lock()
	c.state.Status = Status_ConnectingBeta
	c.stateLock.Unlock()
	beta, betaConnectErr := connect(
		ctx,
		c.logger.Sublogger("beta"),
		c.session.Beta,
		prompter,
		c.session.Identifier,
		c.session.Version,
		c.mergedBetaConfiguration,
		false,
	)
	c.stateLock.Lock()
	c.state.BetaState.Connected = (beta != nil)
	c.stateLock.Unlock()

	// Start the synchronization loop with what we have. Alpha or beta may have
	// failed to connect (and be nil), but in any case that'll just make the run
	// loop keep trying to connect.
	ctx, cancel := context.WithCancel(context.Background())
	c.cancel = cancel
	c.flushRequests = make(chan chan error, 1)
	c.done = make(chan struct{})
	go c.run(ctx, alpha, beta)

	// Report any errors. Since we always want to start a synchronization loop,
	// even on partial or complete failure (since it might be able to
	// auto-reconnect on its own), we wait until the end to report errors.
	if saveErr != nil {
		return fmt.Errorf("unable to save session: %w", saveErr)
	} else if alphaConnectErr != nil {
		return fmt.Errorf("unable to connect to alpha: %w", alphaConnectErr)
	} else if betaConnectErr != nil {
		return fmt.Errorf("unable to connect to beta: %w", betaConnectErr)
	}

	// Success.
	return nil
}

// controllerHaltMode represents the behavior to use when halting a session.
type controllerHaltMode uint8

const (
	// controllerHaltModePause indicates that a session should be halted and
	// marked as paused.
	controllerHaltModePause controllerHaltMode = iota
	// controllerHaltModeShutdown indicates that a session should be halted.
	controllerHaltModeShutdown
	// controllerHaltModeTerminate indicates that a session should be halted and
	// then deleted.
	controllerHaltModeTerminate
)

// description returns a human-readable description of a halt mode.
func (m controllerHaltMode) description() string {
	switch m {
	case controllerHaltModePause:
		return "Pausing"
	case controllerHaltModeShutdown:
		return "Shutting down"
	case controllerHaltModeTerminate:
		return "Terminating"
	default:
		panic("unhandled halt mode")
	}
}

// halt halts the session with the specified behavior. If lifecycleLockHeld is
// true, then halt will assume that the lifecycle lock is held by the caller and
// will not attempt to acquire it.
func (c *controller) halt(_ context.Context, mode controllerHaltMode, prompter string, lifecycleLockHeld bool) error {
	// Update status.
	prompting.Message(prompter, fmt.Sprintf("%s session %s...", mode.description(), c.session.Identifier))

	// If not already held, acquire the lifecycle lock and defer its release.
	if !lifecycleLockHeld {
		c.lifecycleLock.Lock()
		defer c.lifecycleLock.Unlock()
	}

	// Don't allow any additional halt operations if the controller is disabled,
	// because either this session is being terminated or the service is
	// shutting down, and in either case there is no point in halting.
	if c.disabled {
		return errors.New("controller disabled")
	}

	// Perform logging.
	c.logger.Infof(mode.description())

	// Kill any existing synchronization loop.
	if c.cancel != nil {
		// Cancel the synchronization loop and wait for it to finish.
		c.cancel()
		<-c.done

		// Nil out any lifecycle state.
		c.cancel = nil
		c.flushRequests = nil
		c.done = nil
	}

	// Handle based on the halt mode.
	if mode == controllerHaltModePause {
		// Mark the session as paused and save it.
		c.stateLock.Lock()
		c.session.Paused = true
		saveErr := encoding.MarshalAndSaveProtobuf(c.sessionPath, c.session)
		c.stateLock.Unlock()
		if saveErr != nil {
			return fmt.Errorf("unable to save session: %w", saveErr)
		}
	} else if mode == controllerHaltModeShutdown {
		// Disable the controller.
		c.disabled = true
	} else if mode == controllerHaltModeTerminate {
		// Disable the controller.
		c.disabled = true

		// Wipe the session information from disk.
		sessionRemoveErr := os.Remove(c.sessionPath)
		archiveRemoveErr := os.Remove(c.archivePath)
		if sessionRemoveErr != nil {
			return fmt.Errorf("unable to remove session from disk: %w", sessionRemoveErr)
		} else if archiveRemoveErr != nil {
			return fmt.Errorf("unable to remove archive from disk: %w", archiveRemoveErr)
		}
	} else {
		panic("invalid halt mode specified")
	}

	// Success.
	return nil
}

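// Editorial sketch (not part of the original file): a hypothetical wrapper
// expressing the pause and terminate flows in terms of halt. The context and
// prompter values are assumed to be supplied by the caller.
func examplePauseAndTerminate(ctx context.Context, c *controller, prompter string) error {
	// Pause: stop any synchronization loop and mark the session paused on disk.
	if err := c.halt(ctx, controllerHaltModePause, prompter, false); err != nil {
		return fmt.Errorf("unable to pause session: %w", err)
	}
	// Terminate: disable the controller and remove its on-disk state.
	return c.halt(ctx, controllerHaltModeTerminate, prompter, false)
}
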
// reset resets synchronization session history by pausing the session (if it's
// running), overwriting the ancestor data stored on disk with an empty
// ancestor, and then resuming the session (if it was previously running).
func (c *controller) reset(ctx context.Context, prompter string) error {
	// Lock the controller's lifecycle and defer its release.
	c.lifecycleLock.Lock()
	defer c.lifecycleLock.Unlock()

	// Check if the session is currently running.
	running := c.cancel != nil

	// If the session is running, pause it.
	if running {
		if err := c.halt(ctx, controllerHaltModePause, prompter, true); err != nil {
			return fmt.Errorf("unable to pause session: %w", err)
		}
	}

	// Reset the session archive on disk.
	c.logger.Infof("Resetting ancestor")
	archive := &core.Archive{}
	if err := encoding.MarshalAndSaveProtobuf(c.archivePath, archive); err != nil {
		return fmt.Errorf("unable to clear session history: %w", err)
	}

	// Resume the session if it was previously running.
	if running {
		if err := c.resume(ctx, prompter, true); err != nil {
			return fmt.Errorf("unable to resume session: %w", err)
		}
	}

	// Success.
	return nil
}

var (
	// errHaltedForSafety is a sentinel error indicating that a safety check
	// wants the synchronization loop to be halted until manually resumed.
	errHaltedForSafety = errors.New("synchronization halted")
)

// run is the main run loop for the controller, managing connectivity and
// synchronization.
func (c *controller) run(ctx context.Context, alpha, beta Endpoint) {
	// Log run loop entry.
	c.logger.Debug("Run loop commencing")

	// Defer resource and state cleanup.
	defer func() {
		// Shutdown any endpoints. These might be non-nil if the run loop was
		// cancelled while partially connected rather than after sync failure.
		if alpha != nil {
			alpha.Shutdown()
		}
		if beta != nil {
			beta.Shutdown()
		}

		// Reset the state.
		c.stateLock.Lock()
		c.state = &State{
			Session:    c.session,
			AlphaState: &EndpointState{},
			BetaState:  &EndpointState{},
		}
		c.stateLock.Unlock()

		// Log run loop termination.
		c.logger.Debug("Run loop terminated")

		// Signal completion.
		close(c.done)
	}()

	// Track the last time that synchronization failed.
	var lastSynchronizationFailureTime time.Time

	// Loop until cancelled.
	for {
		// Loop until we're connected to both endpoints. We do a non-blocking
		// check for cancellation on each reconnect error so that we don't waste
		// resources by trying another connect when the context has been
		// cancelled (it'll be wasteful). This is better than sentinel errors.
		for {
			// Ensure that alpha is connected.
			if alpha == nil {
				c.stateLock.Lock()
				c.state.Status = Status_ConnectingAlpha
				c.stateLock.Unlock()
				alpha, _ = connect(
					ctx,
					c.logger.Sublogger("alpha"),
					c.session.Alpha,
					"",
					c.session.Identifier,
					c.session.Version,
					c.mergedAlphaConfiguration,
					true,
				)
			}
			c.stateLock.Lock()
			c.state.AlphaState.Connected = (alpha != nil)
			c.stateLock.Unlock()

			// Check for cancellation to avoid a spurious connection to beta in
			// case cancellation occurred while connecting to alpha.
			select {
			case <-ctx.Done():
				return
			default:
			}

			// Ensure that beta is connected.
			if beta == nil {
				c.stateLock.Lock()
				c.state.Status = Status_ConnectingBeta
				c.stateLock.Unlock()
				beta, _ = connect(
					ctx,
					c.logger.Sublogger("beta"),
					c.session.Beta,
					"",
					c.session.Identifier,
					c.session.Version,
					c.mergedBetaConfiguration,
					false,
				)
			}
			c.stateLock.Lock()
			c.state.BetaState.Connected = (beta != nil)
			c.stateLock.Unlock()

			// If both endpoints are connected, we're done. We perform this
			// check here (rather than in the loop condition) because if we did
			// it in the loop condition we'd still need a check here to avoid a
			// sleep every time (even if already successfully connected).
			if alpha != nil && beta != nil {
				break
			}

			// If we failed to connect, wait and then retry.
			// Watch for cancellation in the meantime.
			select {
			case <-ctx.Done():
				return
			case <-time.After(autoReconnectInterval):
			}
		}

		// Indicate that the synchronization loop is entering a state where it
		// can actually perform synchronization. We don't need to perform any
		// notification here since this is not a user-visible state change.
		c.stateLock.Lock()
		c.synchronizing = make(chan struct{})
		c.stateLock.UnlockWithoutNotify()

		// Perform synchronization.
		c.logger.Debug("Entering synchronization loop")
		err := c.synchronize(ctx, alpha, beta)
		c.logger.Debug("Synchronization loop terminated with error:", err)

		// Indicate that the synchronization loop is no longer synchronizing.
		// Again, no notification is required here since this is not a
		// user-visible state change.
		c.stateLock.Lock()
		close(c.synchronizing)
		c.synchronizing = nil
		c.stateLock.UnlockWithoutNotify()

		// Shutdown the endpoints.
		alpha.Shutdown()
		alpha = nil
		beta.Shutdown()
		beta = nil

		// If synchronization failed due to a halting error, then wait for the
		// synchronization loop to be manually resumed.
		if err == errHaltedForSafety {
			<-ctx.Done()
			return
		}

		// Otherwise, reset the synchronization state, but propagate the error
		// that caused failure.
		c.stateLock.Lock()
		c.state = &State{
			Session:    c.session,
			LastError:  err.Error(),
			AlphaState: &EndpointState{},
			BetaState:  &EndpointState{},
		}
		c.stateLock.Unlock()

		// If we were cancelled, then return immediately.
		select {
		case <-ctx.Done():
			return
		default:
		}

		// If less than one auto-reconnect interval has elapsed since the last
		// synchronization failure, then wait before attempting reconnection.
		now := time.Now()
		if now.Sub(lastSynchronizationFailureTime) < autoReconnectInterval {
			select {
			case <-ctx.Done():
				return
			case <-time.After(autoReconnectInterval):
			}
		}
		lastSynchronizationFailureTime = now
	}
}

// synchronize is the main synchronization loop for the controller.
func (c *controller) synchronize(ctx context.Context, alpha, beta Endpoint) error {
	// Clear any error state upon restart of this function. If a terminal error
	// previously caused synchronization to fail, then the user will have had
	// time to review it (while the run loop is waiting to reconnect), so it's
	// not like we're getting rid of it too quickly.
	c.stateLock.Lock()
	if c.state.LastError != "" {
		c.state.LastError = ""
		c.stateLock.Unlock()
	} else {
		c.stateLock.UnlockWithoutNotify()
	}

	// Track whether or not a flush request triggered the synchronization loop.
	var flushRequest chan error

	// Load the archive and extract the ancestor. We enforce that the archive
	// contains only synchronizable content.
	archive := &core.Archive{}
	if err := encoding.LoadAndUnmarshalProtobuf(c.archivePath, archive); err != nil {
		return fmt.Errorf("unable to load archive: %w", err)
	} else if err = archive.EnsureValid(true); err != nil {
		return fmt.Errorf("invalid archive found on disk: %w", err)
	}
	ancestor := archive.Content

	// Compute the effective synchronization mode.
	synchronizationMode := c.session.Configuration.SynchronizationMode
	if synchronizationMode.IsDefault() {
		synchronizationMode = c.session.Version.DefaultSynchronizationMode()
	}

	// Compute the effective ignore syntax.
	ignoreSyntax := c.session.Configuration.IgnoreSyntax
	if ignoreSyntax.IsDefault() {
		ignoreSyntax = c.session.Version.DefaultIgnoreSyntax()
	}

	// Compute the effective permissions mode.
	permissionsMode := c.session.Configuration.PermissionsMode
	if permissionsMode.IsDefault() {
		permissionsMode = c.session.Version.DefaultPermissionsMode()
	}

	// Compute, on a per-endpoint basis, whether or not polling should be
	// disabled.
	αWatchMode := c.mergedAlphaConfiguration.WatchMode
	βWatchMode := c.mergedBetaConfiguration.WatchMode
	if αWatchMode.IsDefault() {
		αWatchMode = c.session.Version.DefaultWatchMode()
	}
	if βWatchMode.IsDefault() {
		βWatchMode = c.session.Version.DefaultWatchMode()
	}
	αDisablePolling := (αWatchMode == WatchMode_WatchModeNoWatch)
	βDisablePolling := (βWatchMode == WatchMode_WatchModeNoWatch)

	// Create a switch that will allow us to skip polling and force a
	// synchronization cycle. On startup, we enable this switch and skip polling
	// to immediately force a check for changes that may have occurred while the
	// synchronization loop wasn't running. The only time we don't force this
	// check on startup is when both endpoints have polling disabled, which is
	// an indication that the session should operate in a fully manual mode.
	skipPolling := (!αDisablePolling || !βDisablePolling)

	// Create variables to track our reasons for skipping polling.
	var skippingPollingDueToScanError, skippingPollingDueToMissingFiles bool

	// Loop until there is a synchronization error.
	for {
		// Unless we've been requested to skip polling, wait for a dirty state
		// while monitoring for cancellation. If we've been requested to skip
		// polling, it should only be for one iteration.
		if !skipPolling {
			// Update status to watching.
			c.stateLock.Lock()
			c.state.Status = Status_Watching
			c.stateLock.Unlock()

			// Create a polling context that we can cancel. We don't make it a
			// subcontext of our own cancellation context because it's easier to
			// just track cancellation there separately.
			pollCtx, pollCancel := context.WithCancel(context.Background())

			// Start alpha polling. If alpha has been put into a no-watch mode,
			// then we still perform polling in order to detect transport errors
			// that might occur while the session is sitting idle, but we ignore
			// any non-error responses and instead wait for the polling context
			// to be cancelled. We perform this ignore operation because we
			// don't want a broken or malicious endpoint to be able to force
			// synchronization, especially if its watching has been
			// intentionally disabled.
			//
			// It's worth noting that, because a well-behaved endpoint in
			// no-watch mode never returns events, we'll always be polling on it
			// (and thereby testing the transport) right up until the polling
			// context is cancelled. Thus, there's no need to worry about cases
			// where the endpoint sends back an event that we ignore and then
			// has a transport failure without us noticing while we wait on the
			// polling context (at least not for well-behaved endpoints).
			αPollResults := make(chan error, 1)
			go func() {
				if αDisablePolling {
					if err := alpha.Poll(pollCtx); err != nil {
						αPollResults <- err
					} else {
						<-pollCtx.Done()
						αPollResults <- nil
					}
				} else {
					αPollResults <- alpha.Poll(pollCtx)
				}
			}()

			// Start beta polling. The logic here mirrors that for alpha above.
			βPollResults := make(chan error, 1)
			go func() {
				if βDisablePolling {
					if err := beta.Poll(pollCtx); err != nil {
						βPollResults <- err
					} else {
						<-pollCtx.Done()
						βPollResults <- nil
					}
				} else {
					βPollResults <- beta.Poll(pollCtx)
				}
			}()

			// Wait for either poll to return an event or an error, for a flush
			// request, or for cancellation. In any of these cases, cancel
			// polling and ensure that both polling operations have completed.
			var αPollErr, βPollErr error
			cancelled := false
			select {
			case αPollErr = <-αPollResults:
				c.logger.Debug("Triggered by alpha endpoint")
				pollCancel()
				βPollErr = <-βPollResults
			case βPollErr = <-βPollResults:
				c.logger.Debug("Triggered by beta endpoint")
				pollCancel()
				αPollErr = <-αPollResults
			case flushRequest = <-c.flushRequests:
				if cap(flushRequest) < 1 {
					panic("unbuffered flush request")
				}
				c.logger.Debug("Triggered by flush request")
				pollCancel()
				αPollErr = <-αPollResults
				βPollErr = <-βPollResults
			case <-ctx.Done():
				cancelled = true
				pollCancel()
				αPollErr = <-αPollResults
				βPollErr = <-βPollResults
			}

			// Watch for errors or cancellation.
			if cancelled {
				return errors.New("cancelled during polling")
			} else if αPollErr != nil {
				return fmt.Errorf("alpha polling error: %w", αPollErr)
			} else if βPollErr != nil {
				return fmt.Errorf("beta polling error: %w", βPollErr)
			}
		} else {
			c.logger.Debug("Skipping polling")
			skipPolling = false
		}

		// Scan both endpoints in parallel and check for errors. If a flush
		// request is present, then force both endpoints to perform a full
		// (warm) re-scan rather than using acceleration.
		c.logger.Debug("Scanning endpoints")
		c.stateLock.Lock()
		c.state.Status = Status_Scanning
		c.stateLock.Unlock()
		forceFullScan := flushRequest != nil
		var αSnapshot, βSnapshot *core.Snapshot
		var αScanErr, βScanErr error
		var αTryAgain, βTryAgain bool
		scanDone := &sync.WaitGroup{}
		scanDone.Add(2)
		go func() {
			αSnapshot, αScanErr, αTryAgain = alpha.Scan(ctx, ancestor, forceFullScan)
			scanDone.Done()
		}()
		go func() {
			βSnapshot, βScanErr, βTryAgain = beta.Scan(ctx, ancestor, forceFullScan)
			scanDone.Done()
		}()
		scanDone.Wait()

		// Check if cancellation occurred during scanning.
		select {
		case <-ctx.Done():
			return errors.New("cancelled during scanning")
		default:
		}

		// Check for scan errors.
		if αScanErr != nil {
			αScanErr = fmt.Errorf("alpha scan error: %w", αScanErr)
			if !αTryAgain {
				return αScanErr
			} else {
				c.stateLock.Lock()
				c.state.LastError = αScanErr.Error()
				c.stateLock.Unlock()
			}
		}
		if βScanErr != nil {
			βScanErr = fmt.Errorf("beta scan error: %w", βScanErr)
			if !βTryAgain {
				return βScanErr
			} else {
				c.stateLock.Lock()
				c.state.LastError = βScanErr.Error()
				c.stateLock.Unlock()
			}
		}

		// Watch for retry recommendations from scan operations. These occur
		// when a scan fails and concurrent modifications are suspected as the
		// culprit. In these cases, we force another synchronization cycle. Note
		// that, because we skip polling, our flush request, if any, will still
		// be valid, and we'll be able to respond to it once a successful
		// synchronization cycle completes.
		//
		// TODO: Should we eventually abort synchronization after a certain
		// number of consecutive scan retries?
		if αTryAgain || βTryAgain {
			// If we're already in a synchronization cycle that was forced due
			// to a previous scan error, and we've now received another retry
			// recommendation, then wait before attempting a rescan.
			if skippingPollingDueToScanError {
				// Update status to waiting for rescan.
				c.stateLock.Lock()
				c.state.Status = Status_WaitingForRescan
				c.stateLock.Unlock()

				// Wait before trying to rescan, but watch for cancellation.
				select {
				case <-time.After(rescanWaitDuration):
				case <-ctx.Done():
					return errors.New("cancelled during rescan wait")
				}
			}

			// Retry.
			skipPolling = true
			skippingPollingDueToScanError = true
			continue
		}
		skippingPollingDueToScanError = false

		// Extract contents.
		αContent := αSnapshot.Content
		βContent := βSnapshot.Content
		if c.logger.Level() >= logging.LevelTrace {
			c.logger.Tracef("Ancestor contains %d entries, alpha contains %d entries, beta contains %d entries",
				ancestor.Count(), αContent.Count(), βContent.Count(),
			)
		}

		// If we're using Docker-style ignore syntax and semantics, then
		// snapshots may include phantom directories. In this case, we need to
		// perform a pre-processing step to reify these directories to either
		// tracked or ignored.
		αDirectoryCount := αSnapshot.Directories
		βDirectoryCount := βSnapshot.Directories
		if ignoreSyntax == ignore.Syntax_SyntaxDocker {
			αContent, βContent, αDirectoryCount, βDirectoryCount = core.ReifyPhantomDirectories(
				ancestor, αContent, βContent,
			)
		}

		// Now that we've had a successful scan, clear the last error (if any),
		// record scan statistics and problems (if any), and update the status
		// to reconciling.
		//
		// We know that it's okay to clear the error here (if there is one)
		// because we know that it originated from scan (since all other errors
		// are terminal and any previous terminal error would have been cleared
		// at the start of this function).
		c.stateLock.Lock()
		c.state.LastError = ""
		c.state.AlphaState.Scanned = true
		c.state.AlphaState.Directories = αDirectoryCount
		c.state.AlphaState.Files = αSnapshot.Files
		c.state.AlphaState.SymbolicLinks = αSnapshot.SymbolicLinks
		c.state.AlphaState.TotalFileSize = αSnapshot.TotalFileSize
		c.state.AlphaState.ScanProblems = αContent.Problems()
		c.state.BetaState.Scanned = true
		c.state.BetaState.Directories = βDirectoryCount
		c.state.BetaState.Files = βSnapshot.Files
		c.state.BetaState.SymbolicLinks = βSnapshot.SymbolicLinks
		c.state.BetaState.TotalFileSize = βSnapshot.TotalFileSize
		c.state.BetaState.ScanProblems = βContent.Problems()
		c.state.Status = Status_Reconciling
		c.stateLock.Unlock()

		// If we're propagating executability bits and one endpoint preserves
		// executability information while the other does not, then propagate
		// executability information from the preserving side to the
		// non-preserving side. We only do this if the corresponding target
		// content is non-nil, because (a) PropagateExecutability is a no-op if
		// it is nil and (b) PreservesExecutability will have defaulted to false
		// if there's no content and (even though this will be a no-op) we don't
		// want the spurious logs.
		if permissionsMode == core.PermissionsMode_PermissionsModePortable {
			if αSnapshot.PreservesExecutability && βContent != nil && !βSnapshot.PreservesExecutability {
				c.logger.Debug("Propagating alpha executability to beta")
				βContent = core.PropagateExecutability(ancestor, αContent, βContent)
			} else if βSnapshot.PreservesExecutability && αContent != nil && !αSnapshot.PreservesExecutability {
				c.logger.Debug("Propagating beta executability to alpha")
				αContent = core.PropagateExecutability(ancestor, βContent, αContent)
			}
		}

		// Check if the root is a directory that's been emptied (by deleting a
		// non-trivial amount of content) on one endpoint (but not both). This
		// can be intentional, but usually indicates that a non-persistent
		// filesystem (such as a container filesystem) is being used as the
		// synchronization root. In any case, we switch to a halted state and
		// wait for the user to either manually propagate the deletion and
		// resume the session, recreate the session, or reset the session.
		if oneEndpointEmptiedRoot(ancestor, αContent, βContent) {
			c.stateLock.Lock()
			c.state.Status = Status_HaltedOnRootEmptied
			c.stateLock.Unlock()
			return errHaltedForSafety
		}

		// Perform reconciliation.
		c.logger.Debug("Performing reconciliation")
		ancestorChanges, αTransitions, βTransitions, conflicts := core.Reconcile(
			ancestor,
			αContent,
			βContent,
			synchronizationMode,
		)
		if c.logger.Level() >= logging.LevelTrace {
			for _, change := range ancestorChanges {
				c.logger.Tracef("Ancestor change at \"%s\" to %s",
					formatPathForLogging(change.Path),
					formatEntryForLogging(change.New),
				)
			}
			for _, transition := range αTransitions {
				c.logger.Tracef("Alpha transition at \"%s\" from %s to %s",
					formatPathForLogging(transition.Path),
					formatEntryForLogging(transition.Old),
					formatEntryForLogging(transition.New),
				)
			}
			for _, transition := range βTransitions {
				c.logger.Tracef("Beta transition at \"%s\" from %s to %s",
					formatPathForLogging(transition.Path),
					formatEntryForLogging(transition.Old),
					formatEntryForLogging(transition.New),
				)
			}
			for _, conflict := range conflicts {
				c.logger.Tracef("Conflict rooted at \"%s\"",
					formatPathForLogging(conflict.Root),
				)
			}
		}

		// Store conflicts that arose during reconciliation.
		c.stateLock.Lock()
		c.state.Conflicts = conflicts
		c.stateLock.Unlock()

		// Check if a root deletion operation is being propagated. This can be
		// intentional, accidental, or an indication of a non-persistent
		// filesystem (such as a container filesystem). In any case, we switch
		// to a halted state and wait for the user to either manually propagate
		// the deletion and resume the session, recreate the session, or reset
		// the session.
		if containsRootDeletion(αTransitions) || containsRootDeletion(βTransitions) {
			c.stateLock.Lock()
			c.state.Status = Status_HaltedOnRootDeletion
			c.stateLock.Unlock()
			return errHaltedForSafety
		}

		// Check if a root type change is being propagated. This can be
		// intentional or accidental. In any case, we switch to a halted state
		// and wait for the user to manually delete the content that will be
		// overwritten by the type change and resume the session.
		if containsRootTypeChange(αTransitions) || containsRootTypeChange(βTransitions) {
			c.stateLock.Lock()
			c.state.Status = Status_HaltedOnRootTypeChange
			c.stateLock.Unlock()
			return errHaltedForSafety
		}

		// Stage files on alpha.
		c.stateLock.Lock()
		c.state.Status = Status_StagingAlpha
		c.stateLock.Unlock()
		if paths, digests := core.TransitionDependencies(αTransitions); len(paths) > 0 {
			c.logger.Debugf("Staging %d file(s) on alpha", len(paths))
			filteredPaths, signatures, receiver, err := alpha.Stage(paths, digests)
			if err != nil {
				return fmt.Errorf("unable to begin staging on alpha: %w", err)
			}
			if !filteredPathsAreSubset(filteredPaths, paths) {
				return errors.New("alpha returned incorrect subset of staging paths")
			}
			if len(filteredPaths) < len(paths) {
				c.logger.Debugf("Alpha pre-staged %d/%d files", len(paths)-len(filteredPaths), len(paths))
			}
			if len(filteredPaths) > 0 {
				monitor := func(state *rsync.ReceiverState) error {
					c.stateLock.Lock()
					if state == nil {
						c.state.AlphaState.StagingProgress = nil
					} else {
						if c.state.AlphaState.StagingProgress == nil {
							c.state.AlphaState.StagingProgress = &rsync.ReceiverState{}
						}
						proto.Merge(c.state.AlphaState.StagingProgress, state)
					}
					c.stateLock.Unlock()
					return nil
				}
				receiver = rsync.NewMonitoringReceiver(receiver, filteredPaths, signatures, monitor)
				receiver = rsync.NewPreemptableReceiver(ctx, receiver)
				if err = beta.Supply(filteredPaths, signatures, receiver); err != nil {
					return fmt.Errorf("unable to stage files on alpha: %w", err)
				}
			}
		}

		// Stage files on beta.
		c.stateLock.Lock()
		c.state.Status = Status_StagingBeta
		c.stateLock.Unlock()
		if paths, digests := core.TransitionDependencies(βTransitions); len(paths) > 0 {
			c.logger.Debugf("Staging %d file(s) on beta", len(paths))
			filteredPaths, signatures, receiver, err := beta.Stage(paths, digests)
			if err != nil {
				return fmt.Errorf("unable to begin staging on beta: %w", err)
			}
			if !filteredPathsAreSubset(filteredPaths, paths) {
				return errors.New("beta returned incorrect subset of staging paths")
			}
			if len(filteredPaths) < len(paths) {
				c.logger.Debugf("Beta pre-staged %d/%d files", len(paths)-len(filteredPaths), len(paths))
			}
			if len(filteredPaths) > 0 {
				monitor := func(state *rsync.ReceiverState) error {
					c.stateLock.Lock()
					if state == nil {
						c.state.BetaState.StagingProgress = nil
					} else {
						if c.state.BetaState.StagingProgress == nil {
							c.state.BetaState.StagingProgress = &rsync.ReceiverState{}
						}
						proto.Merge(c.state.BetaState.StagingProgress, state)
					}
					c.stateLock.Unlock()
					return nil
				}
				receiver = rsync.NewMonitoringReceiver(receiver, filteredPaths, signatures, monitor)
				receiver = rsync.NewPreemptableReceiver(ctx, receiver)
				if err = alpha.Supply(filteredPaths, signatures, receiver); err != nil {
					return fmt.Errorf("unable to stage files on beta: %w", err)
				}
			}
		}

		// Perform transitions on both endpoints in parallel. For each side that
		// doesn't completely error out, convert its results to ancestor
		// changes. Transition errors are checked later, once the ancestor has
		// been updated.
		c.stateLock.Lock()
		c.state.Status = Status_Transitioning
		c.stateLock.Unlock()
		var αResults, βResults []*core.Entry
		var αProblems, βProblems []*core.Problem
		var αMissingFiles, βMissingFiles bool
		var αTransitionErr, βTransitionErr error
		var αChanges, βChanges []*core.Change
		transitionDone := &sync.WaitGroup{}
		if len(αTransitions) > 0 {
			transitionDone.Add(1)
		}
		if len(βTransitions) > 0 {
			transitionDone.Add(1)
		}
		if len(αTransitions) > 0 {
			c.logger.Debug("Transitioning alpha")
			go func() {
				αResults, αProblems, αMissingFiles, αTransitionErr = alpha.Transition(ctx, αTransitions)
				if αTransitionErr == nil {
					for t, transition := range αTransitions {
						αChanges = append(αChanges, &core.Change{Path: transition.Path, New: αResults[t]})
					}
				}
				transitionDone.Done()
			}()
		}
		if len(βTransitions) > 0 {
			c.logger.Debug("Transitioning beta")
			go func() {
				βResults, βProblems, βMissingFiles, βTransitionErr = beta.Transition(ctx, βTransitions)
				if βTransitionErr == nil {
					for t, transition := range βTransitions {
						βChanges = append(βChanges, &core.Change{Path: transition.Path, New: βResults[t]})
					}
				}
				transitionDone.Done()
			}()
		}
		transitionDone.Wait()

		// Record transition problems.
		c.stateLock.Lock()
		c.state.Status = Status_Saving
		c.state.AlphaState.TransitionProblems = αProblems
		c.state.BetaState.TransitionProblems = βProblems
		c.stateLock.Unlock()

		// Fold applied changes into the ancestor's change list and update the
		// ancestor if any changes are present.
		ancestorChanges = append(ancestorChanges, αChanges...)
		ancestorChanges = append(ancestorChanges, βChanges...)
		if len(ancestorChanges) > 0 {
			// Apply the changes to the ancestor.
			if newAncestor, err := core.Apply(ancestor, ancestorChanges); err != nil {
				return fmt.Errorf("unable to propagate changes to ancestor: %w", err)
			} else {
				ancestor = newAncestor
			}

			// Validate the new ancestor before saving it to ensure that our
			// reconciliation logic doesn't have any flaws. This is the only time
			// that we validate a data structure generated by code in the same
			// process (usually our tests are our validation), but this case is
			// special because (a) our test cases can't cover every real world
			// condition that might arise and (b) if we write a broken ancestor to
			// disk, the session is toast. This safety check ensures that even if we
			// put out a broken release, or encounter some bizarre real world merge
			// case that we didn't consider, things can be fixed.
			if err := ancestor.EnsureValid(true); err != nil {
				return fmt.Errorf("new ancestor is invalid: %w", err)
			}

			// Save the ancestor.
			c.logger.Debug("Saving ancestor")
			archive.Content = ancestor
			if err := encoding.MarshalAndSaveProtobuf(c.archivePath, archive); err != nil {
				return fmt.Errorf("unable to save ancestor: %w", err)
			}
		}

		// Now check for transition errors.
		if αTransitionErr != nil {
			return fmt.Errorf("unable to apply changes to alpha: %w", αTransitionErr)
		} else if βTransitionErr != nil {
			return fmt.Errorf("unable to apply changes to beta: %w", βTransitionErr)
		}

		// If there were files missing from either endpoint's stager during the
		// transition operations, then there were likely concurrent
		// modifications during staging. If we see this, then skip polling and
		// attempt to run another synchronization cycle immediately, but only if
		// we're not already in a synchronization cycle that was forced due to
		// previously missing files.
		if (αMissingFiles || βMissingFiles) && !skippingPollingDueToMissingFiles {
			c.logger.Debug("Endpoint(s) missing files after transition, skipping polling")
			skipPolling = true
			skippingPollingDueToMissingFiles = true
		} else {
			skippingPollingDueToMissingFiles = false
		}

		// Increment the synchronization cycle count.
		c.stateLock.Lock()
		c.state.SuccessfulCycles++
		c.stateLock.Unlock()

		// If a flush request triggered this synchronization cycle, then tell it
		// that the cycle has completed and remove it from our tracking.
		if flushRequest != nil {
			flushRequest <- nil
			flushRequest = nil
		}
	}
}
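
// Editorial sketch (not part of the original file): a hypothetical read-only
// status probe built on currentState, e.g. for periodic logging of the
// session's progress by a caller that owns the controller.
func exampleLogStatus(c *controller) {
	s := c.currentState()
	c.logger.Infof("Session %s status: %v (%d successful cycles)",
		s.Session.Identifier, s.Status, s.SuccessfulCycles)
}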