github.com/mutagen-io/mutagen@v0.18.0-rc1/pkg/synchronization/controller.go

package synchronization

import (
	"context"
	"errors"
	"fmt"
	"os"
	"sync"
	"time"

	"google.golang.org/protobuf/proto"
	"google.golang.org/protobuf/types/known/timestamppb"

	"github.com/mutagen-io/mutagen/pkg/encoding"
	"github.com/mutagen-io/mutagen/pkg/logging"
	"github.com/mutagen-io/mutagen/pkg/mutagen"
	"github.com/mutagen-io/mutagen/pkg/prompting"
	"github.com/mutagen-io/mutagen/pkg/state"
	"github.com/mutagen-io/mutagen/pkg/synchronization/core"
	"github.com/mutagen-io/mutagen/pkg/synchronization/core/ignore"
	"github.com/mutagen-io/mutagen/pkg/synchronization/rsync"
	"github.com/mutagen-io/mutagen/pkg/url"
)

const (
	// autoReconnectInterval is the period of time to wait before attempting an
	// automatic reconnect after disconnection or a failed reconnect.
	autoReconnectInterval = 15 * time.Second
	// rescanWaitDuration is the period of time to wait before attempting to
	// rescan after an ephemeral scan failure.
	rescanWaitDuration = 5 * time.Second
)

// controller manages and executes a single session.
type controller struct {
	// logger is the controller logger.
	logger *logging.Logger
	// sessionPath is the path to the serialized session.
	sessionPath string
	// archivePath is the path to the serialized archive.
	archivePath string
	// stateLock guards and tracks changes to the session's Paused field, state,
	// and synchronizing. Previous holders may continue to poll on synchronizing
	// if they store it in a separate variable before releasing the lock.
	stateLock *state.TrackingLock
	// session encodes the associated session metadata. It is considered static
	// and safe for concurrent access except for its Paused field, for which
	// stateLock should be held. It should be saved to disk any time it is
	// modified.
	session *Session
	// mergedAlphaConfiguration is the alpha-specific configuration object
	// (computed from the core configuration and alpha-specific overrides). It
	// is considered static and safe for concurrent access. It is a derived
	// field and not saved to disk.
	mergedAlphaConfiguration *Configuration
	// mergedBetaConfiguration is the beta-specific configuration object
	// (computed from the core configuration and beta-specific overrides). It is
	// considered static and safe for concurrent access. It is a derived field
	// and not saved to disk.
	mergedBetaConfiguration *Configuration
	// state represents the current synchronization state.
	state *State
	// synchronizing is used to track whether or not the synchronization loop is
	// currently in a state where it is capable of performing synchronization.
	// It is non-nil if and only if the synchronization loop is connected and in
	// a state where it can perform synchronization. It is closed when
	// synchronization fails due to an error.
	synchronizing chan struct{}
	// lifecycleLock guards access to disabled, cancel, flushRequests, and done.
	// Only the current holder of the lifecycle lock may set any of these fields
	// or invoke cancel. The synchronization loop may close done or receive from
	// flushRequests without holding the lifecycle lock. Moreover, previous
	// lifecycle lock holders may continue to send to flushRequests and poll on
	// done after storing them in separate variables and releasing the lifecycle
	// lock.
	// Any code wishing to set these fields must first acquire the lock, then
	// cancel the synchronization loop and wait for it to complete before making
	// any changes.
	lifecycleLock sync.Mutex
	// disabled indicates that no more changes to the synchronization loop
	// lifecycle are allowed (i.e. no more synchronization loops can be started
	// for this controller). This is used by terminate and shutdown. It should
	// only be set to true once any existing synchronization loop has been
	// stopped.
	disabled bool
	// cancel cancels the synchronization loop execution context. It is nil if
	// and only if there is no synchronization loop running.
	cancel context.CancelFunc
	// flushRequests is used to pass flush requests to the synchronization loop.
	// It is buffered, allowing a single request to be queued. All requests
	// passed via this channel must be buffered and contain room for one error.
	flushRequests chan chan error
	// done will be closed by the current synchronization loop when it exits.
	done chan struct{}
}

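// Editorial note (hedged sketch, not part of the original source): per the
// lifecycleLock field comments above, code that needs to stop the loop and
// mutate lifecycle state follows the same pattern used by resume and halt
// below:
//
//	c.lifecycleLock.Lock()
//	if c.cancel != nil {
//		c.cancel()
//		<-c.done
//		c.cancel, c.flushRequests, c.done = nil, nil, nil
//	}
//	// ... mutate disabled, session.Paused, etc. ...
//	c.lifecycleLock.Unlock()
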
// newSession creates a new session and corresponding controller.
func newSession(
	ctx context.Context,
	logger *logging.Logger,
	tracker *state.Tracker,
	identifier string,
	alpha, beta *url.URL,
	configuration, configurationAlpha, configurationBeta *Configuration,
	name string,
	labels map[string]string,
	paused bool,
	prompter string,
) (*controller, error) {
	// Update status.
	prompting.Message(prompter, "Creating session...")

	// Set the session version.
	version := DefaultVersion

	// Compute the creation time and check that it's valid for Protocol Buffers.
	creationTime := timestamppb.Now()
	if err := creationTime.CheckValid(); err != nil {
		return nil, fmt.Errorf("unable to record creation time: %w", err)
	}

	// Compute merged endpoint configurations.
	mergedAlphaConfiguration := MergeConfigurations(configuration, configurationAlpha)
	mergedBetaConfiguration := MergeConfigurations(configuration, configurationBeta)

	// If the session isn't being created paused, then try to connect to the
	// endpoints. Before doing so, set up a deferred handler that will shut down
	// any endpoints that aren't handed off to the run loop due to errors.
	var alphaEndpoint, betaEndpoint Endpoint
	var err error
	defer func() {
		if alphaEndpoint != nil {
			alphaEndpoint.Shutdown()
			alphaEndpoint = nil
		}
		if betaEndpoint != nil {
			betaEndpoint.Shutdown()
			betaEndpoint = nil
		}
	}()
	if !paused {
		logger.Info("Connecting to alpha endpoint")
		alphaEndpoint, err = connect(
			ctx,
			logger.Sublogger("alpha"),
			alpha,
			prompter,
			identifier,
			version,
			mergedAlphaConfiguration,
			true,
		)
		if err != nil {
			logger.Info("Alpha connection failure:", err)
			return nil, fmt.Errorf("unable to connect to alpha: %w", err)
		}
		logger.Info("Connecting to beta endpoint")
		betaEndpoint, err = connect(
			ctx,
			logger.Sublogger("beta"),
			beta,
			prompter,
			identifier,
			version,
			mergedBetaConfiguration,
			false,
		)
		if err != nil {
			logger.Info("Beta connection failure:", err)
			return nil, fmt.Errorf("unable to connect to beta: %w", err)
		}
	}

	// Create the session and initial archive.
	session := &Session{
		Identifier:           identifier,
		Version:              version,
		CreationTime:         creationTime,
		CreatingVersionMajor: mutagen.VersionMajor,
		CreatingVersionMinor: mutagen.VersionMinor,
		CreatingVersionPatch: mutagen.VersionPatch,
		Alpha:                alpha,
		Beta:                 beta,
		Configuration:        configuration,
		ConfigurationAlpha:   configurationAlpha,
		ConfigurationBeta:    configurationBeta,
		Name:                 name,
		Labels:               labels,
		Paused:               paused,
	}
	archive := &core.Archive{}

	// Compute the session and archive paths.
	sessionPath, err := pathForSession(session.Identifier)
	if err != nil {
		return nil, fmt.Errorf("unable to compute session path: %w", err)
	}
	archivePath, err := pathForArchive(session.Identifier)
	if err != nil {
		return nil, fmt.Errorf("unable to compute archive path: %w", err)
	}

	// Save components to disk.
	if err := encoding.MarshalAndSaveProtobuf(sessionPath, session); err != nil {
		return nil, fmt.Errorf("unable to save session: %w", err)
	}
	if err := encoding.MarshalAndSaveProtobuf(archivePath, archive); err != nil {
		os.Remove(sessionPath)
		return nil, fmt.Errorf("unable to save archive: %w", err)
	}

	// Create the controller.
	controller := &controller{
		logger:                   logger,
		sessionPath:              sessionPath,
		archivePath:              archivePath,
		stateLock:                state.NewTrackingLock(tracker),
		session:                  session,
		mergedAlphaConfiguration: mergedAlphaConfiguration,
		mergedBetaConfiguration:  mergedBetaConfiguration,
		state: &State{
			Session:    session,
			AlphaState: &EndpointState{},
			BetaState:  &EndpointState{},
		},
	}

	// If the session isn't being created paused, then start a synchronization
	// loop and mark the endpoints as handed off to that loop so that we don't
	// defer their shutdown.
	if !paused {
		ctx, cancel := context.WithCancel(context.Background())
		controller.cancel = cancel
		controller.flushRequests = make(chan chan error, 1)
		controller.done = make(chan struct{})
		go controller.run(ctx, alphaEndpoint, betaEndpoint)
		alphaEndpoint = nil
		betaEndpoint = nil
	}

	// Success.
	logger.Info("Session initialized")
	return controller, nil
}

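// Editorial sketch (not part of the original file): a hypothetical caller
// creating a session in a paused state and connecting it later via resume.
// All argument values are assumed to be supplied by the caller; the empty
// prompter string mirrors the prompterless calls made elsewhere in this file.
func exampleCreatePausedThenResume(
	ctx context.Context,
	logger *logging.Logger,
	tracker *state.Tracker,
	identifier string,
	alpha, beta *url.URL,
	configuration *Configuration,
) error {
	// Create the controller without connecting to either endpoint.
	c, err := newSession(
		ctx, logger, tracker, identifier,
		alpha, beta,
		configuration, &Configuration{}, &Configuration{},
		"", nil, true, "",
	)
	if err != nil {
		return fmt.Errorf("unable to create session: %w", err)
	}

	// Connect to both endpoints and start the synchronization loop.
	return c.resume(ctx, "", false)
}
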
// loadSession loads an existing session and creates a corresponding controller.
func loadSession(logger *logging.Logger, tracker *state.Tracker, identifier string) (*controller, error) {
	// Compute session and archive paths.
	sessionPath, err := pathForSession(identifier)
	if err != nil {
		return nil, fmt.Errorf("unable to compute session path: %w", err)
	}
	archivePath, err := pathForArchive(identifier)
	if err != nil {
		return nil, fmt.Errorf("unable to compute archive path: %w", err)
	}

	// Load and validate the session. We have to populate a few optional fields
	// before validation if they're not set. We can't do this in the Session
	// literal because they'll be wiped out during unmarshalling, even if not
	// set.
	session := &Session{}
	if err := encoding.LoadAndUnmarshalProtobuf(sessionPath, session); err != nil {
		return nil, fmt.Errorf("unable to load session configuration: %w", err)
	}
	if session.ConfigurationAlpha == nil {
		session.ConfigurationAlpha = &Configuration{}
	}
	if session.ConfigurationBeta == nil {
		session.ConfigurationBeta = &Configuration{}
	}
	if err := session.EnsureValid(); err != nil {
		return nil, fmt.Errorf("invalid session found on disk: %w", err)
	}

	// Create the controller.
	controller := &controller{
		logger:      logger,
		sessionPath: sessionPath,
		archivePath: archivePath,
		stateLock:   state.NewTrackingLock(tracker),
		session:     session,
		mergedAlphaConfiguration: MergeConfigurations(
			session.Configuration,
			session.ConfigurationAlpha,
		),
		mergedBetaConfiguration: MergeConfigurations(
			session.Configuration,
			session.ConfigurationBeta,
		),
		state: &State{
			Session:    session,
			AlphaState: &EndpointState{},
			BetaState:  &EndpointState{},
		},
	}

	// If the session isn't marked as paused, start a synchronization loop.
	if !session.Paused {
		ctx, cancel := context.WithCancel(context.Background())
		controller.cancel = cancel
		controller.flushRequests = make(chan chan error, 1)
		controller.done = make(chan struct{})
		go controller.run(ctx, nil, nil)
	}

	// Success.
	logger.Info("Session loaded")
	return controller, nil
}

// currentState creates a static snapshot of the current session state.
func (c *controller) currentState() *State {
	// Lock the session state and defer its release. It's very important that we
	// unlock without a notification here, otherwise we'd trigger an infinite
	// cycle of list/notify.
	c.stateLock.Lock()
	defer c.stateLock.UnlockWithoutNotify()

	// Create a static copy of the state.
	return proto.Clone(c.state).(*State)
}

// flush attempts to force a synchronization cycle for the session. Unless
// skipWait is specified, the method will wait until a post-flush
// synchronization cycle has completed. The provided context (which must be
// non-nil) can terminate this wait early.
func (c *controller) flush(ctx context.Context, prompter string, skipWait bool) error {
	// Update status.
	prompting.Message(prompter, fmt.Sprintf("Forcing synchronization cycle for session %s...", c.session.Identifier))

	// Lock the controller's lifecycle.
	c.lifecycleLock.Lock()

	// Don't allow any operations if the controller is disabled.
	if c.disabled {
		c.lifecycleLock.Unlock()
		return errors.New("controller disabled")
	}

	// Check if the session is paused.
	if c.cancel == nil {
		c.lifecycleLock.Unlock()
		return errors.New("session is paused")
	}

	// Perform logging.
	c.logger.Infof("Forcing synchronization cycle")

	// Check if the session is currently synchronizing and store the channel
	// that we'll use to track synchronizability.
	c.stateLock.Lock()
	synchronizing := c.synchronizing
	c.stateLock.UnlockWithoutNotify()
	if synchronizing == nil {
		c.lifecycleLock.Unlock()
		return errors.New("session is not currently able to synchronize")
	}

	// Store the channels that we'll need to submit flush requests and track
	// synchronization termination.
	flushRequests := c.flushRequests
	done := c.done

	// Release the lifecycle lock.
	c.lifecycleLock.Unlock()

	// Create a flush request.
	request := make(chan error, 1)

	// If we don't want to wait, then we can simply send the request in a
	// non-blocking manner, in which case either this request (or one that's
	// already queued) will be processed eventually. After that, we're done. In
	// this case, we'll still check for an inability to synchronize, since we
	// may as well report it if we can.
	if skipWait {
		select {
		case flushRequests <- request:
			return nil
		case <-synchronizing:
			return errors.New("synchronization failed before flush request could be sent")
		case <-done:
			return errors.New("synchronization terminated before flush request could be sent")
		default:
			return nil
		}
	}

	// Otherwise we need to send the request in a blocking manner, watching for
	// cancellation, failure, or termination.
	select {
	case flushRequests <- request:
	case <-ctx.Done():
		return errors.New("flush cancelled before request could be sent")
	case <-synchronizing:
		return errors.New("synchronization failed before flush request could be sent")
	case <-done:
		return errors.New("synchronization terminated before flush request could be sent")
	}

	// Now we need to wait for a response to the request, again watching for
	// cancellation, failure, or termination.
	select {
	case err := <-request:
		return err
	case <-ctx.Done():
		return errors.New("flush cancelled while waiting for response")
	case <-synchronizing:
		return errors.New("synchronization failed while waiting for flush response")
	case <-done:
		return errors.New("synchronization terminated while waiting for flush response")
	}
}

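// Editorial sketch (not part of the original file): a hypothetical caller
// forcing a synchronization cycle and bounding the wait with a timeout. The
// 30-second timeout and empty prompter are illustrative assumptions.
func exampleFlushWithTimeout(c *controller) error {
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()
	return c.flush(ctx, "", false)
}
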
// resume attempts to reconnect and resume the session if it isn't currently
// connected and synchronizing. If lifecycleLockHeld is true, then resume will
// assume that the lifecycle lock is held by the caller and will not attempt to
// acquire it.
func (c *controller) resume(ctx context.Context, prompter string, lifecycleLockHeld bool) error {
	// Update status.
	prompting.Message(prompter, fmt.Sprintf("Resuming session %s...", c.session.Identifier))

	// If not already held, acquire the lifecycle lock and defer its release.
	if !lifecycleLockHeld {
		c.lifecycleLock.Lock()
		defer c.lifecycleLock.Unlock()
	}

	// Don't allow any resume operations if the controller is disabled.
	if c.disabled {
		return errors.New("controller disabled")
	}

	// Perform logging.
	c.logger.Infof("Resuming")

	// Check if there's an existing synchronization loop (i.e. if the session is
	// unpaused).
	if c.cancel != nil {
		// If there is an existing synchronization loop, check if it's already
		// in a state that's considered "connected".
		c.stateLock.Lock()
		connected := c.state.Status >= Status_Watching
		c.stateLock.UnlockWithoutNotify()

		// If we're already connected, then there's nothing we need to do. We
		// don't even need to mark the session as unpaused because it can't be
		// marked as paused if an existing synchronization loop is running (we
		// enforce this invariant as part of the controller's logic).
		if connected {
			return nil
		}

		// Otherwise, cancel the existing synchronization loop and wait for it
		// to finish.
		//
		// There's something of an efficiency race condition here, because the
		// existing loop might succeed in connecting between the time we check
		// and the time we cancel it. That could happen if an auto-reconnect
		// succeeds or even if the loop was already passed connections and it
		// just hasn't updated its status yet. But the only danger here is
		// basically wasting those connections, and the window is very small.
		c.cancel()
		<-c.done

		// Nil out any lifecycle state.
		c.cancel = nil
		c.flushRequests = nil
		c.done = nil
	}

	// Mark the session as unpaused and save it to disk.
	c.stateLock.Lock()
	c.session.Paused = false
	saveErr := encoding.MarshalAndSaveProtobuf(c.sessionPath, c.session)
	c.stateLock.Unlock()

	// Attempt to connect to alpha.
	c.stateLock.Lock()
	c.state.Status = Status_ConnectingAlpha
	c.stateLock.Unlock()
	alpha, alphaConnectErr := connect(
		ctx,
		c.logger.Sublogger("alpha"),
		c.session.Alpha,
		prompter,
		c.session.Identifier,
		c.session.Version,
		c.mergedAlphaConfiguration,
		true,
	)
	c.stateLock.Lock()
	c.state.AlphaState.Connected = (alpha != nil)
	c.stateLock.Unlock()

	// Attempt to connect to beta.
	c.stateLock.Lock()
	c.state.Status = Status_ConnectingBeta
	c.stateLock.Unlock()
	beta, betaConnectErr := connect(
		ctx,
		c.logger.Sublogger("beta"),
		c.session.Beta,
		prompter,
		c.session.Identifier,
		c.session.Version,
		c.mergedBetaConfiguration,
		false,
	)
	c.stateLock.Lock()
	c.state.BetaState.Connected = (beta != nil)
	c.stateLock.Unlock()

	// Start the synchronization loop with what we have. Alpha or beta may have
	// failed to connect (and be nil), but in any case that'll just make the run
	// loop keep trying to connect.
	ctx, cancel := context.WithCancel(context.Background())
	c.cancel = cancel
	c.flushRequests = make(chan chan error, 1)
	c.done = make(chan struct{})
	go c.run(ctx, alpha, beta)

	// Report any errors. Since we always want to start a synchronization loop,
	// even on partial or complete failure (since it might be able to
	// auto-reconnect on its own), we wait until the end to report errors.
	if saveErr != nil {
		return fmt.Errorf("unable to save session: %w", saveErr)
	} else if alphaConnectErr != nil {
		return fmt.Errorf("unable to connect to alpha: %w", alphaConnectErr)
	} else if betaConnectErr != nil {
		return fmt.Errorf("unable to connect to beta: %w", betaConnectErr)
	}

	// Success.
	return nil
}

// controllerHaltMode represents the behavior to use when halting a session.
type controllerHaltMode uint8

const (
	// controllerHaltModePause indicates that a session should be halted and
	// marked as paused.
	controllerHaltModePause controllerHaltMode = iota
	// controllerHaltModeShutdown indicates that a session should be halted.
	controllerHaltModeShutdown
	// controllerHaltModeTerminate indicates that a session should be halted and
	// then deleted.
	controllerHaltModeTerminate
)

// description returns a human-readable description of a halt mode.
func (m controllerHaltMode) description() string {
	switch m {
	case controllerHaltModePause:
		return "Pausing"
	case controllerHaltModeShutdown:
		return "Shutting down"
	case controllerHaltModeTerminate:
		return "Terminating"
	default:
		panic("unhandled halt mode")
	}
}

// halt halts the session with the specified behavior. If lifecycleLockHeld is
// true, then halt will assume that the lifecycle lock is held by the caller and
// will not attempt to acquire it.
func (c *controller) halt(_ context.Context, mode controllerHaltMode, prompter string, lifecycleLockHeld bool) error {
	// Update status.
	prompting.Message(prompter, fmt.Sprintf("%s session %s...", mode.description(), c.session.Identifier))

	// If not already held, acquire the lifecycle lock and defer its release.
	if !lifecycleLockHeld {
		c.lifecycleLock.Lock()
		defer c.lifecycleLock.Unlock()
	}

	// Don't allow any additional halt operations if the controller is disabled,
	// because either this session is being terminated or the service is
	// shutting down, and in either case there is no point in halting.
	if c.disabled {
		return errors.New("controller disabled")
	}

	// Perform logging.
	c.logger.Infof(mode.description())

	// Kill any existing synchronization loop.
	if c.cancel != nil {
		// Cancel the synchronization loop and wait for it to finish.
		c.cancel()
		<-c.done

		// Nil out any lifecycle state.
		c.cancel = nil
		c.flushRequests = nil
		c.done = nil
	}

	// Handle based on the halt mode.
	if mode == controllerHaltModePause {
		// Mark the session as paused and save it.
		c.stateLock.Lock()
		c.session.Paused = true
		saveErr := encoding.MarshalAndSaveProtobuf(c.sessionPath, c.session)
		c.stateLock.Unlock()
		if saveErr != nil {
			return fmt.Errorf("unable to save session: %w", saveErr)
		}
	} else if mode == controllerHaltModeShutdown {
		// Disable the controller.
		c.disabled = true
	} else if mode == controllerHaltModeTerminate {
		// Disable the controller.
		c.disabled = true

		// Wipe the session information from disk.
		sessionRemoveErr := os.Remove(c.sessionPath)
		archiveRemoveErr := os.Remove(c.archivePath)
		if sessionRemoveErr != nil {
			return fmt.Errorf("unable to remove session from disk: %w", sessionRemoveErr)
		} else if archiveRemoveErr != nil {
			return fmt.Errorf("unable to remove archive from disk: %w", archiveRemoveErr)
		}
	} else {
		panic("invalid halt mode specified")
	}

	// Success.
	return nil
}

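// Editorial sketch (not part of the original file): a hypothetical wrapper
// expressing the pause and terminate flows in terms of halt. The context and
// prompter values are assumed to be supplied by the caller.
func examplePauseAndTerminate(ctx context.Context, c *controller, prompter string) error {
	// Pause: stop any synchronization loop and mark the session paused on disk.
	if err := c.halt(ctx, controllerHaltModePause, prompter, false); err != nil {
		return fmt.Errorf("unable to pause session: %w", err)
	}
	// Terminate: disable the controller and remove its on-disk state.
	return c.halt(ctx, controllerHaltModeTerminate, prompter, false)
}
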
// reset resets synchronization session history by pausing the session (if it's
// running), overwriting the ancestor data stored on disk with an empty
// ancestor, and then resuming the session (if it was previously running).
func (c *controller) reset(ctx context.Context, prompter string) error {
	// Lock the controller's lifecycle and defer its release.
	c.lifecycleLock.Lock()
	defer c.lifecycleLock.Unlock()

	// Check if the session is currently running.
	running := c.cancel != nil

	// If the session is running, pause it.
	if running {
		if err := c.halt(ctx, controllerHaltModePause, prompter, true); err != nil {
			return fmt.Errorf("unable to pause session: %w", err)
		}
	}

	// Reset the session archive on disk.
	c.logger.Infof("Resetting ancestor")
	archive := &core.Archive{}
	if err := encoding.MarshalAndSaveProtobuf(c.archivePath, archive); err != nil {
		return fmt.Errorf("unable to clear session history: %w", err)
	}

	// Resume the session if it was previously running.
	if running {
		if err := c.resume(ctx, prompter, true); err != nil {
			return fmt.Errorf("unable to resume session: %w", err)
		}
	}

	// Success.
	return nil
}

var (
	// errHaltedForSafety is a sentinel error indicating that a safety check
	// wants the synchronization loop to be halted until manually resumed.
	errHaltedForSafety = errors.New("synchronization halted")
)

// run is the main run loop for the controller, managing connectivity and
// synchronization.
func (c *controller) run(ctx context.Context, alpha, beta Endpoint) {
	// Log run loop entry.
	c.logger.Debug("Run loop commencing")

	// Defer resource and state cleanup.
	defer func() {
		// Shutdown any endpoints. These might be non-nil if the run loop was
		// cancelled while partially connected rather than after sync failure.
		if alpha != nil {
			alpha.Shutdown()
		}
		if beta != nil {
			beta.Shutdown()
		}

		// Reset the state.
		c.stateLock.Lock()
		c.state = &State{
			Session:    c.session,
			AlphaState: &EndpointState{},
			BetaState:  &EndpointState{},
		}
		c.stateLock.Unlock()

		// Log run loop termination.
		c.logger.Debug("Run loop terminated")

		// Signal completion.
		close(c.done)
	}()

	// Track the last time that synchronization failed.
	var lastSynchronizationFailureTime time.Time

	// Loop until cancelled.
	for {
		// Loop until we're connected to both endpoints. We do a non-blocking
		// check for cancellation on each reconnect error so that we don't waste
		// resources by trying another connect when the context has been
		// cancelled (it'll be wasteful). This is better than sentinel errors.
		for {
			// Ensure that alpha is connected.
			if alpha == nil {
				c.stateLock.Lock()
				c.state.Status = Status_ConnectingAlpha
				c.stateLock.Unlock()
				alpha, _ = connect(
					ctx,
					c.logger.Sublogger("alpha"),
					c.session.Alpha,
					"",
					c.session.Identifier,
					c.session.Version,
					c.mergedAlphaConfiguration,
					true,
				)
			}
			c.stateLock.Lock()
			c.state.AlphaState.Connected = (alpha != nil)
			c.stateLock.Unlock()

			// Check for cancellation to avoid a spurious connection to beta in
			// case cancellation occurred while connecting to alpha.
			select {
			case <-ctx.Done():
				return
			default:
			}

			// Ensure that beta is connected.
			if beta == nil {
				c.stateLock.Lock()
				c.state.Status = Status_ConnectingBeta
				c.stateLock.Unlock()
				beta, _ = connect(
					ctx,
					c.logger.Sublogger("beta"),
					c.session.Beta,
					"",
					c.session.Identifier,
					c.session.Version,
					c.mergedBetaConfiguration,
					false,
				)
			}
			c.stateLock.Lock()
			c.state.BetaState.Connected = (beta != nil)
			c.stateLock.Unlock()

			// If both endpoints are connected, we're done. We perform this
			// check here (rather than in the loop condition) because if we did
			// it in the loop condition we'd still need a check here to avoid a
			// sleep every time (even if already successfully connected).
			if alpha != nil && beta != nil {
				break
			}

			// If we failed to connect, wait and then retry.
			// Watch for cancellation in the meantime.
			select {
			case <-ctx.Done():
				return
			case <-time.After(autoReconnectInterval):
			}
		}

		// Indicate that the synchronization loop is entering a state where it
		// can actually perform synchronization. We don't need to perform any
		// notification here since this is not a user-visible state change.
		c.stateLock.Lock()
		c.synchronizing = make(chan struct{})
		c.stateLock.UnlockWithoutNotify()

		// Perform synchronization.
		c.logger.Debug("Entering synchronization loop")
		err := c.synchronize(ctx, alpha, beta)
		c.logger.Debug("Synchronization loop terminated with error:", err)

		// Indicate that the synchronization loop is no longer synchronizing.
		// Again, no notification is required here since this is not a
		// user-visible state change.
		c.stateLock.Lock()
		close(c.synchronizing)
		c.synchronizing = nil
		c.stateLock.UnlockWithoutNotify()

		// Shutdown the endpoints.
		alpha.Shutdown()
		alpha = nil
		beta.Shutdown()
		beta = nil

		// If synchronization failed due to a halting error, then wait for the
		// synchronization loop to be manually resumed.
		if err == errHaltedForSafety {
			<-ctx.Done()
			return
		}

		// Otherwise, reset the synchronization state, but propagate the error
		// that caused failure.
		c.stateLock.Lock()
		c.state = &State{
			Session:    c.session,
			LastError:  err.Error(),
			AlphaState: &EndpointState{},
			BetaState:  &EndpointState{},
		}
		c.stateLock.Unlock()

		// If we were cancelled, then return immediately.
		select {
		case <-ctx.Done():
			return
		default:
		}

		// If less than one auto-reconnect interval has elapsed since the last
		// synchronization failure, then wait before attempting reconnection.
		now := time.Now()
		if now.Sub(lastSynchronizationFailureTime) < autoReconnectInterval {
			select {
			case <-ctx.Done():
				return
			case <-time.After(autoReconnectInterval):
			}
		}
		lastSynchronizationFailureTime = now
	}
}

// synchronize is the main synchronization loop for the controller.
func (c *controller) synchronize(ctx context.Context, alpha, beta Endpoint) error {
	// Clear any error state upon restart of this function. If a terminal error
	// previously caused synchronization to fail, then the user will have had
	// time to review it (while the run loop is waiting to reconnect), so it's
	// not like we're getting rid of it too quickly.
	c.stateLock.Lock()
	if c.state.LastError != "" {
		c.state.LastError = ""
		c.stateLock.Unlock()
	} else {
		c.stateLock.UnlockWithoutNotify()
	}

	// Track whether or not a flush request triggered the synchronization loop.
	var flushRequest chan error

	// Load the archive and extract the ancestor. We enforce that the archive
	// contains only synchronizable content.
	archive := &core.Archive{}
	if err := encoding.LoadAndUnmarshalProtobuf(c.archivePath, archive); err != nil {
		return fmt.Errorf("unable to load archive: %w", err)
	} else if err = archive.EnsureValid(true); err != nil {
		return fmt.Errorf("invalid archive found on disk: %w", err)
	}
	ancestor := archive.Content

	// Compute the effective synchronization mode.
	synchronizationMode := c.session.Configuration.SynchronizationMode
	if synchronizationMode.IsDefault() {
		synchronizationMode = c.session.Version.DefaultSynchronizationMode()
	}

	// Compute the effective ignore syntax.
	ignoreSyntax := c.session.Configuration.IgnoreSyntax
	if ignoreSyntax.IsDefault() {
		ignoreSyntax = c.session.Version.DefaultIgnoreSyntax()
	}

	// Compute the effective permissions mode.
	permissionsMode := c.session.Configuration.PermissionsMode
	if permissionsMode.IsDefault() {
		permissionsMode = c.session.Version.DefaultPermissionsMode()
	}

	// Compute, on a per-endpoint basis, whether or not polling should be
	// disabled.
	αWatchMode := c.mergedAlphaConfiguration.WatchMode
	βWatchMode := c.mergedBetaConfiguration.WatchMode
	if αWatchMode.IsDefault() {
		αWatchMode = c.session.Version.DefaultWatchMode()
	}
	if βWatchMode.IsDefault() {
		βWatchMode = c.session.Version.DefaultWatchMode()
	}
	αDisablePolling := (αWatchMode == WatchMode_WatchModeNoWatch)
	βDisablePolling := (βWatchMode == WatchMode_WatchModeNoWatch)

	// Create a switch that will allow us to skip polling and force a
	// synchronization cycle. On startup, we enable this switch and skip polling
	// to immediately force a check for changes that may have occurred while the
	// synchronization loop wasn't running. The only time we don't force this
	// check on startup is when both endpoints have polling disabled, which is
	// an indication that the session should operate in a fully manual mode.
	skipPolling := (!αDisablePolling || !βDisablePolling)

	// Create variables to track our reasons for skipping polling.
	var skippingPollingDueToScanError, skippingPollingDueToMissingFiles bool

	// Loop until there is a synchronization error.
	for {
		// Unless we've been requested to skip polling, wait for a dirty state
		// while monitoring for cancellation. If we've been requested to skip
		// polling, it should only be for one iteration.
		if !skipPolling {
			// Update status to watching.
			c.stateLock.Lock()
			c.state.Status = Status_Watching
			c.stateLock.Unlock()

			// Create a polling context that we can cancel. We don't make it a
			// subcontext of our own cancellation context because it's easier to
			// just track cancellation there separately.
			pollCtx, pollCancel := context.WithCancel(context.Background())

			// Start alpha polling. If alpha has been put into a no-watch mode,
			// then we still perform polling in order to detect transport errors
			// that might occur while the session is sitting idle, but we ignore
			// any non-error responses and instead wait for the polling context
			// to be cancelled. We perform this ignore operation because we
			// don't want a broken or malicious endpoint to be able to force
			// synchronization, especially if its watching has been
			// intentionally disabled.
			//
			// It's worth noting that, because a well-behaved endpoint in
			// no-watch mode never returns events, we'll always be polling on it
			// (and thereby testing the transport) right up until the polling
			// context is cancelled. Thus, there's no need to worry about cases
			// where the endpoint sends back an event that we ignore and then
			// has a transport failure without us noticing while we wait on the
			// polling context (at least not for well-behaved endpoints).
			αPollResults := make(chan error, 1)
			go func() {
				if αDisablePolling {
					if err := alpha.Poll(pollCtx); err != nil {
						αPollResults <- err
					} else {
						<-pollCtx.Done()
						αPollResults <- nil
					}
				} else {
					αPollResults <- alpha.Poll(pollCtx)
				}
			}()

			// Start beta polling. The logic here mirrors that for alpha above.
			βPollResults := make(chan error, 1)
			go func() {
				if βDisablePolling {
					if err := beta.Poll(pollCtx); err != nil {
						βPollResults <- err
					} else {
						<-pollCtx.Done()
						βPollResults <- nil
					}
				} else {
					βPollResults <- beta.Poll(pollCtx)
				}
			}()

			// Wait for either poll to return an event or an error, for a flush
			// request, or for cancellation. In any of these cases, cancel
			// polling and ensure that both polling operations have completed.
			var αPollErr, βPollErr error
			cancelled := false
			select {
			case αPollErr = <-αPollResults:
				c.logger.Debug("Triggered by alpha endpoint")
				pollCancel()
				βPollErr = <-βPollResults
			case βPollErr = <-βPollResults:
				c.logger.Debug("Triggered by beta endpoint")
				pollCancel()
				αPollErr = <-αPollResults
			case flushRequest = <-c.flushRequests:
				if cap(flushRequest) < 1 {
					panic("unbuffered flush request")
				}
				c.logger.Debug("Triggered by flush request")
				pollCancel()
				αPollErr = <-αPollResults
				βPollErr = <-βPollResults
			case <-ctx.Done():
				cancelled = true
				pollCancel()
				αPollErr = <-αPollResults
				βPollErr = <-βPollResults
			}

			// Watch for errors or cancellation.
			if cancelled {
				return errors.New("cancelled during polling")
			} else if αPollErr != nil {
				return fmt.Errorf("alpha polling error: %w", αPollErr)
			} else if βPollErr != nil {
				return fmt.Errorf("beta polling error: %w", βPollErr)
			}
		} else {
			c.logger.Debug("Skipping polling")
			skipPolling = false
		}

		// Scan both endpoints in parallel and check for errors. If a flush
		// request is present, then force both endpoints to perform a full
		// (warm) re-scan rather than using acceleration.
		c.logger.Debug("Scanning endpoints")
		c.stateLock.Lock()
		c.state.Status = Status_Scanning
		c.stateLock.Unlock()
		forceFullScan := flushRequest != nil
		var αSnapshot, βSnapshot *core.Snapshot
		var αScanErr, βScanErr error
		var αTryAgain, βTryAgain bool
		scanDone := &sync.WaitGroup{}
		scanDone.Add(2)
		go func() {
			αSnapshot, αScanErr, αTryAgain = alpha.Scan(ctx, ancestor, forceFullScan)
			scanDone.Done()
		}()
		go func() {
			βSnapshot, βScanErr, βTryAgain = beta.Scan(ctx, ancestor, forceFullScan)
			scanDone.Done()
		}()
		scanDone.Wait()

		// Check if cancellation occurred during scanning.
		select {
		case <-ctx.Done():
			return errors.New("cancelled during scanning")
		default:
		}

		// Check for scan errors.
		if αScanErr != nil {
			αScanErr = fmt.Errorf("alpha scan error: %w", αScanErr)
			if !αTryAgain {
				return αScanErr
			} else {
				c.stateLock.Lock()
				c.state.LastError = αScanErr.Error()
				c.stateLock.Unlock()
			}
		}
		if βScanErr != nil {
			βScanErr = fmt.Errorf("beta scan error: %w", βScanErr)
			if !βTryAgain {
				return βScanErr
			} else {
				c.stateLock.Lock()
				c.state.LastError = βScanErr.Error()
				c.stateLock.Unlock()
			}
		}

		// Watch for retry recommendations from scan operations. These occur
		// when a scan fails and concurrent modifications are suspected as the
		// culprit. In these cases, we force another synchronization cycle. Note
		// that, because we skip polling, our flush request, if any, will still
		// be valid, and we'll be able to respond to it once a successful
		// synchronization cycle completes.
		//
		// TODO: Should we eventually abort synchronization after a certain
		// number of consecutive scan retries?
		if αTryAgain || βTryAgain {
			// If we're already in a synchronization cycle that was forced due
			// to a previous scan error, and we've now received another retry
			// recommendation, then wait before attempting a rescan.
			if skippingPollingDueToScanError {
				// Update status to waiting for rescan.
				c.stateLock.Lock()
				c.state.Status = Status_WaitingForRescan
				c.stateLock.Unlock()

				// Wait before trying to rescan, but watch for cancellation.
				select {
				case <-time.After(rescanWaitDuration):
				case <-ctx.Done():
					return errors.New("cancelled during rescan wait")
				}
			}

			// Retry.
			skipPolling = true
			skippingPollingDueToScanError = true
			continue
		}
		skippingPollingDueToScanError = false

		// Extract contents.
		αContent := αSnapshot.Content
		βContent := βSnapshot.Content
		if c.logger.Level() >= logging.LevelTrace {
			c.logger.Tracef("Ancestor contains %d entries, alpha contains %d entries, beta contains %d entries",
				ancestor.Count(), αContent.Count(), βContent.Count(),
			)
		}

		// If we're using Docker-style ignore syntax and semantics, then
		// snapshots may include phantom directories. In this case, we need to
		// perform a pre-processing step to reify these directories to either
		// tracked or ignored.
		αDirectoryCount := αSnapshot.Directories
		βDirectoryCount := βSnapshot.Directories
		if ignoreSyntax == ignore.Syntax_SyntaxDocker {
			αContent, βContent, αDirectoryCount, βDirectoryCount = core.ReifyPhantomDirectories(
				ancestor, αContent, βContent,
			)
		}

		// Now that we've had a successful scan, clear the last error (if any),
		// record scan statistics and problems (if any), and update the status
		// to reconciling.
		//
		// We know that it's okay to clear the error here (if there is one)
		// because we know that it originated from scan (since all other errors
		// are terminal and any previous terminal error would have been cleared
		// at the start of this function).
		c.stateLock.Lock()
		c.state.LastError = ""
		c.state.AlphaState.Scanned = true
		c.state.AlphaState.Directories = αDirectoryCount
		c.state.AlphaState.Files = αSnapshot.Files
		c.state.AlphaState.SymbolicLinks = αSnapshot.SymbolicLinks
		c.state.AlphaState.TotalFileSize = αSnapshot.TotalFileSize
		c.state.AlphaState.ScanProblems = αContent.Problems()
		c.state.BetaState.Scanned = true
		c.state.BetaState.Directories = βDirectoryCount
		c.state.BetaState.Files = βSnapshot.Files
		c.state.BetaState.SymbolicLinks = βSnapshot.SymbolicLinks
		c.state.BetaState.TotalFileSize = βSnapshot.TotalFileSize
		c.state.BetaState.ScanProblems = βContent.Problems()
		c.state.Status = Status_Reconciling
		c.stateLock.Unlock()

		// If we're propagating executability bits and one endpoint preserves
		// executability information while the other does not, then propagate
		// executability information from the preserving side to the
		// non-preserving side. We only do this if the corresponding target
		// content is non-nil, because (a) PropagateExecutability is a no-op if
		// it is nil and (b) PreservesExecutability will have defaulted to false
		// if there's no content and (even though this will be a no-op) we don't
		// want the spurious logs.
		if permissionsMode == core.PermissionsMode_PermissionsModePortable {
			if αSnapshot.PreservesExecutability && βContent != nil && !βSnapshot.PreservesExecutability {
				c.logger.Debug("Propagating alpha executability to beta")
				βContent = core.PropagateExecutability(ancestor, αContent, βContent)
			} else if βSnapshot.PreservesExecutability && αContent != nil && !αSnapshot.PreservesExecutability {
				c.logger.Debug("Propagating beta executability to alpha")
				αContent = core.PropagateExecutability(ancestor, βContent, αContent)
			}
		}

		// Check if the root is a directory that's been emptied (by deleting a
		// non-trivial amount of content) on one endpoint (but not both). This
		// can be intentional, but usually indicates that a non-persistent
		// filesystem (such as a container filesystem) is being used as the
		// synchronization root. In any case, we switch to a halted state and
		// wait for the user to either manually propagate the deletion and
		// resume the session, recreate the session, or reset the session.
		if oneEndpointEmptiedRoot(ancestor, αContent, βContent) {
			c.stateLock.Lock()
			c.state.Status = Status_HaltedOnRootEmptied
			c.stateLock.Unlock()
			return errHaltedForSafety
		}

		// Perform reconciliation.
		c.logger.Debug("Performing reconciliation")
		ancestorChanges, αTransitions, βTransitions, conflicts := core.Reconcile(
			ancestor,
			αContent,
			βContent,
			synchronizationMode,
		)
		if c.logger.Level() >= logging.LevelTrace {
			for _, change := range ancestorChanges {
				c.logger.Tracef("Ancestor change at \"%s\" to %s",
					formatPathForLogging(change.Path),
					formatEntryForLogging(change.New),
				)
			}
			for _, transition := range αTransitions {
				c.logger.Tracef("Alpha transition at \"%s\" from %s to %s",
					formatPathForLogging(transition.Path),
					formatEntryForLogging(transition.Old),
					formatEntryForLogging(transition.New),
				)
			}
			for _, transition := range βTransitions {
				c.logger.Tracef("Beta transition at \"%s\" from %s to %s",
					formatPathForLogging(transition.Path),
					formatEntryForLogging(transition.Old),
					formatEntryForLogging(transition.New),
				)
			}
			for _, conflict := range conflicts {
				c.logger.Tracef("Conflict rooted at \"%s\"",
					formatPathForLogging(conflict.Root),
				)
			}
		}

		// Store conflicts that arose during reconciliation.
		c.stateLock.Lock()
		c.state.Conflicts = conflicts
		c.stateLock.Unlock()

		// Check if a root deletion operation is being propagated. This can be
		// intentional, accidental, or an indication of a non-persistent
		// filesystem (such as a container filesystem). In any case, we switch
		// to a halted state and wait for the user to either manually propagate
		// the deletion and resume the session, recreate the session, or reset
		// the session.
		if containsRootDeletion(αTransitions) || containsRootDeletion(βTransitions) {
			c.stateLock.Lock()
			c.state.Status = Status_HaltedOnRootDeletion
			c.stateLock.Unlock()
			return errHaltedForSafety
		}

		// Check if a root type change is being propagated. This can be
		// intentional or accidental. In any case, we switch to a halted state
		// and wait for the user to manually delete the content that will be
		// overwritten by the type change and resume the session.
		if containsRootTypeChange(αTransitions) || containsRootTypeChange(βTransitions) {
			c.stateLock.Lock()
			c.state.Status = Status_HaltedOnRootTypeChange
			c.stateLock.Unlock()
			return errHaltedForSafety
		}

		// Stage files on alpha.
		c.stateLock.Lock()
		c.state.Status = Status_StagingAlpha
		c.stateLock.Unlock()
		if paths, digests := core.TransitionDependencies(αTransitions); len(paths) > 0 {
			c.logger.Debugf("Staging %d file(s) on alpha", len(paths))
			filteredPaths, signatures, receiver, err := alpha.Stage(paths, digests)
			if err != nil {
				return fmt.Errorf("unable to begin staging on alpha: %w", err)
			}
			if !filteredPathsAreSubset(filteredPaths, paths) {
				return errors.New("alpha returned incorrect subset of staging paths")
			}
			if len(filteredPaths) < len(paths) {
				c.logger.Debugf("Alpha pre-staged %d/%d files", len(paths)-len(filteredPaths), len(paths))
			}
			if len(filteredPaths) > 0 {
				monitor := func(state *rsync.ReceiverState) error {
					c.stateLock.Lock()
					if state == nil {
						c.state.AlphaState.StagingProgress = nil
					} else {
						if c.state.AlphaState.StagingProgress == nil {
							c.state.AlphaState.StagingProgress = &rsync.ReceiverState{}
						}
						proto.Merge(c.state.AlphaState.StagingProgress, state)
					}
					c.stateLock.Unlock()
					return nil
				}
				receiver = rsync.NewMonitoringReceiver(receiver, filteredPaths, signatures, monitor)
				receiver = rsync.NewPreemptableReceiver(ctx, receiver)
				if err = beta.Supply(filteredPaths, signatures, receiver); err != nil {
					return fmt.Errorf("unable to stage files on alpha: %w", err)
				}
			}
		}

		// Stage files on beta.
		c.stateLock.Lock()
		c.state.Status = Status_StagingBeta
		c.stateLock.Unlock()
		if paths, digests := core.TransitionDependencies(βTransitions); len(paths) > 0 {
			c.logger.Debugf("Staging %d file(s) on beta", len(paths))
			filteredPaths, signatures, receiver, err := beta.Stage(paths, digests)
			if err != nil {
				return fmt.Errorf("unable to begin staging on beta: %w", err)
			}
			if !filteredPathsAreSubset(filteredPaths, paths) {
				return errors.New("beta returned incorrect subset of staging paths")
			}
			if len(filteredPaths) < len(paths) {
				c.logger.Debugf("Beta pre-staged %d/%d files", len(paths)-len(filteredPaths), len(paths))
			}
			if len(filteredPaths) > 0 {
				monitor := func(state *rsync.ReceiverState) error {
					c.stateLock.Lock()
					if state == nil {
						c.state.BetaState.StagingProgress = nil
					} else {
						if c.state.BetaState.StagingProgress == nil {
							c.state.BetaState.StagingProgress = &rsync.ReceiverState{}
						}
						proto.Merge(c.state.BetaState.StagingProgress, state)
					}
					c.stateLock.Unlock()
					return nil
				}
				receiver = rsync.NewMonitoringReceiver(receiver, filteredPaths, signatures, monitor)
				receiver = rsync.NewPreemptableReceiver(ctx, receiver)
				if err = alpha.Supply(filteredPaths, signatures, receiver); err != nil {
					return fmt.Errorf("unable to stage files on beta: %w", err)
				}
			}
		}

		// Perform transitions on both endpoints in parallel. For each side that
		// doesn't completely error out, convert its results to ancestor
		// changes. Transition errors are checked later, once the ancestor has
		// been updated.
		c.stateLock.Lock()
		c.state.Status = Status_Transitioning
		c.stateLock.Unlock()
		var αResults, βResults []*core.Entry
		var αProblems, βProblems []*core.Problem
		var αMissingFiles, βMissingFiles bool
		var αTransitionErr, βTransitionErr error
		var αChanges, βChanges []*core.Change
		transitionDone := &sync.WaitGroup{}
		if len(αTransitions) > 0 {
			transitionDone.Add(1)
		}
		if len(βTransitions) > 0 {
			transitionDone.Add(1)
		}
		if len(αTransitions) > 0 {
			c.logger.Debug("Transitioning alpha")
			go func() {
				αResults, αProblems, αMissingFiles, αTransitionErr = alpha.Transition(ctx, αTransitions)
				if αTransitionErr == nil {
					for t, transition := range αTransitions {
						αChanges = append(αChanges, &core.Change{Path: transition.Path, New: αResults[t]})
					}
				}
				transitionDone.Done()
			}()
		}
		if len(βTransitions) > 0 {
			c.logger.Debug("Transitioning beta")
			go func() {
				βResults, βProblems, βMissingFiles, βTransitionErr = beta.Transition(ctx, βTransitions)
				if βTransitionErr == nil {
					for t, transition := range βTransitions {
						βChanges = append(βChanges, &core.Change{Path: transition.Path, New: βResults[t]})
					}
				}
				transitionDone.Done()
			}()
		}
		transitionDone.Wait()

		// Record transition problems.
		c.stateLock.Lock()
		c.state.Status = Status_Saving
		c.state.AlphaState.TransitionProblems = αProblems
		c.state.BetaState.TransitionProblems = βProblems
		c.stateLock.Unlock()

		// Fold applied changes into the ancestor's change list and update the
		// ancestor if any changes are present.
		ancestorChanges = append(ancestorChanges, αChanges...)
		ancestorChanges = append(ancestorChanges, βChanges...)
		if len(ancestorChanges) > 0 {
			// Apply the changes to the ancestor.
			if newAncestor, err := core.Apply(ancestor, ancestorChanges); err != nil {
				return fmt.Errorf("unable to propagate changes to ancestor: %w", err)
			} else {
				ancestor = newAncestor
			}

			// Validate the new ancestor before saving it to ensure that our
			// reconciliation logic doesn't have any flaws. This is the only time
			// that we validate a data structure generated by code in the same
			// process (usually our tests are our validation), but this case is
			// special because (a) our test cases can't cover every real world
			// condition that might arise and (b) if we write a broken ancestor to
			// disk, the session is toast. This safety check ensures that even if we
			// put out a broken release, or encounter some bizarre real world merge
			// case that we didn't consider, things can be fixed.
			if err := ancestor.EnsureValid(true); err != nil {
				return fmt.Errorf("new ancestor is invalid: %w", err)
			}

			// Save the ancestor.
			c.logger.Debug("Saving ancestor")
			archive.Content = ancestor
			if err := encoding.MarshalAndSaveProtobuf(c.archivePath, archive); err != nil {
				return fmt.Errorf("unable to save ancestor: %w", err)
			}
		}

		// Now check for transition errors.
		if αTransitionErr != nil {
			return fmt.Errorf("unable to apply changes to alpha: %w", αTransitionErr)
		} else if βTransitionErr != nil {
			return fmt.Errorf("unable to apply changes to beta: %w", βTransitionErr)
		}

		// If there were files missing from either endpoint's stager during the
		// transition operations, then there were likely concurrent
		// modifications during staging. If we see this, then skip polling and
		// attempt to run another synchronization cycle immediately, but only if
		// we're not already in a synchronization cycle that was forced due to
		// previously missing files.
		if (αMissingFiles || βMissingFiles) && !skippingPollingDueToMissingFiles {
			c.logger.Debug("Endpoint(s) missing files after transition, skipping polling")
			skipPolling = true
			skippingPollingDueToMissingFiles = true
		} else {
			skippingPollingDueToMissingFiles = false
		}

		// Increment the synchronization cycle count.
		c.stateLock.Lock()
		c.state.SuccessfulCycles++
		c.stateLock.Unlock()

		// If a flush request triggered this synchronization cycle, then tell it
		// that the cycle has completed and remove it from our tracking.
		if flushRequest != nil {
			flushRequest <- nil
			flushRequest = nil
		}
	}
}
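
// Editorial sketch (not part of the original file): a hypothetical read-only
// status probe built on currentState, e.g. for periodic logging of the
// session's progress by a caller that owns the controller.
func exampleLogStatus(c *controller) {
	s := c.currentState()
	c.logger.Infof("Session %s status: %v (%d successful cycles)",
		s.Session.Identifier, s.Status, s.SuccessfulCycles)
}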