github.com/koko1123/flow-go-1@v0.29.6/cmd/dynamic_startup.go (about)

     1  package cmd
     2  
     3  import (
     4  	"context"
     5  	"encoding/hex"
     6  	"encoding/json"
     7  	"fmt"
     8  	"path/filepath"
     9  	"strconv"
    10  	"strings"
    11  	"time"
    12  
    13  	"github.com/rs/zerolog"
    14  	"github.com/sethvargo/go-retry"
    15  
    16  	client "github.com/onflow/flow-go-sdk/access/grpc"
    17  	"github.com/koko1123/flow-go-1/cmd/util/cmd/common"
    18  	"github.com/koko1123/flow-go-1/model/bootstrap"
    19  	"github.com/koko1123/flow-go-1/state/protocol"
    20  	badgerstate "github.com/koko1123/flow-go-1/state/protocol/badger"
    21  	utilsio "github.com/koko1123/flow-go-1/utils/io"
    22  	"github.com/onflow/flow-go/crypto"
    23  
    24  	"github.com/koko1123/flow-go-1/model/flow"
    25  	"github.com/koko1123/flow-go-1/state/protocol/inmem"
    26  )
    27  
    28  const getSnapshotTimeout = 30 * time.Second
    29  
    30  // GetProtocolSnapshot callback that will get latest finalized protocol snapshot
    31  type GetProtocolSnapshot func(ctx context.Context) (protocol.Snapshot, error)
    32  
    33  // GetSnapshot will attempt to get the latest finalized protocol snapshot with the given flow configs
    34  func GetSnapshot(ctx context.Context, client *client.Client) (*inmem.Snapshot, error) {
    35  	ctx, cancel := context.WithTimeout(ctx, getSnapshotTimeout)
    36  	defer cancel()
    37  
    38  	b, err := client.GetLatestProtocolStateSnapshot(ctx)
    39  	if err != nil {
    40  		return nil, fmt.Errorf("failed to get latest finalized protocol state snapshot during pre-initialization: %w", err)
    41  	}
    42  
    43  	var snapshotEnc inmem.EncodableSnapshot
    44  	err = json.Unmarshal(b, &snapshotEnc)
    45  	if err != nil {
    46  		return nil, fmt.Errorf("failed to unmarshal protocol state snapshot: %w", err)
    47  	}
    48  
    49  	snapshot := inmem.SnapshotFromEncodable(snapshotEnc)
    50  	return snapshot, nil
    51  }
    52  
    53  // GetSnapshotAtEpochAndPhase will get the latest finalized protocol snapshot and check the current epoch and epoch phase.
    54  // If we are past the target epoch and epoch phase we exit the retry mechanism immediately.
    55  // If not check the snapshot at the specified interval until we reach the target epoch and phase.
    56  func GetSnapshotAtEpochAndPhase(ctx context.Context, log zerolog.Logger, startupEpoch uint64, startupEpochPhase flow.EpochPhase, retryInterval time.Duration, getSnapshot GetProtocolSnapshot) (protocol.Snapshot, error) {
    57  	start := time.Now()
    58  
    59  	log = log.With().
    60  		Uint64("target_epoch_counter", startupEpoch).
    61  		Str("target_epoch_phase", startupEpochPhase.String()).
    62  		Logger()
    63  
    64  	log.Info().Msg("starting dynamic startup - waiting until target epoch/phase to start...")
    65  
    66  	var snapshot protocol.Snapshot
    67  	var err error
    68  
    69  	backoff := retry.NewConstant(retryInterval)
    70  	err = retry.Do(ctx, backoff, func(ctx context.Context) error {
    71  		snapshot, err = getSnapshot(ctx)
    72  		if err != nil {
    73  			err = fmt.Errorf("failed to get protocol snapshot: %w", err)
    74  			log.Error().Err(err).Msg("could not get protocol snapshot")
    75  			return retry.RetryableError(err)
    76  		}
    77  
    78  		// if we encounter any errors interpreting the snapshot something went wrong stop retrying
    79  		currEpochCounter, err := snapshot.Epochs().Current().Counter()
    80  		if err != nil {
    81  			return fmt.Errorf("failed to get the current epoch counter: %w", err)
    82  		}
    83  
    84  		currEpochPhase, err := snapshot.Phase()
    85  		if err != nil {
    86  			return fmt.Errorf("failed to get the current epoch phase: %w", err)
    87  		}
    88  
    89  		// check if we are in or past the target epoch and phase
    90  		if currEpochCounter > startupEpoch || (currEpochCounter == startupEpoch && currEpochPhase >= startupEpochPhase) {
    91  			log.Info().
    92  				Dur("time-waiting", time.Since(start)).
    93  				Uint64("current-epoch", currEpochCounter).
    94  				Str("current-epoch-phase", currEpochPhase.String()).
    95  				Msg("finished dynamic startup - reached desired epoch and phase")
    96  
    97  			return nil
    98  		}
    99  
   100  		// wait then poll for latest snapshot again
   101  		log.Info().
   102  			Dur("time-waiting", time.Since(start)).
   103  			Uint64("current-epoch", currEpochCounter).
   104  			Str("current-epoch-phase", currEpochPhase.String()).
   105  			Msgf("waiting for epoch %d and phase %s", startupEpoch, startupEpochPhase.String())
   106  
   107  		return retry.RetryableError(fmt.Errorf("dynamic startup epoch and epoch phase not reached"))
   108  	})
   109  	if err != nil {
   110  		return nil, fmt.Errorf("failed to wait for target epoch and phase: %w", err)
   111  	}
   112  
   113  	return snapshot, nil
   114  }
   115  
   116  // ValidateDynamicStartupFlags will validate flags necessary for dynamic node startup
   117  // - assert dynamic-startup-access-publickey  is valid ECDSA_P256 public key hex
   118  // - assert dynamic-startup-access-address is not empty
   119  // - assert dynamic-startup-startup-epoch-phase is > 0 (EpochPhaseUndefined)
   120  func ValidateDynamicStartupFlags(accessPublicKey, accessAddress string, startPhase flow.EpochPhase) error {
   121  	b, err := hex.DecodeString(strings.TrimPrefix(accessPublicKey, "0x"))
   122  	if err != nil {
   123  		return fmt.Errorf("invalid flag --dynamic-startup-access-publickey: %w", err)
   124  	}
   125  
   126  	_, err = crypto.DecodePublicKey(crypto.ECDSAP256, b)
   127  	if err != nil {
   128  		return fmt.Errorf("invalid flag --dynamic-startup-access-publickey: %w", err)
   129  	}
   130  
   131  	if accessAddress == "" {
   132  		return fmt.Errorf("invalid flag --dynamic-startup-access-address can not be empty")
   133  	}
   134  
   135  	if startPhase <= flow.EpochPhaseUndefined {
   136  		return fmt.Errorf("invalid flag --dynamic-startup-startup-epoch-phase unknown epoch phase")
   137  	}
   138  
   139  	return nil
   140  }
   141  
   142  // DynamicStartPreInit is the pre-init func that will check if a node has already bootstrapped
   143  // from a root protocol snapshot. If not attempt to get a protocol snapshot where the following
   144  // conditions are met.
   145  // 1. Target epoch < current epoch (in the past), set root snapshot to current snapshot
   146  // 2. Target epoch == "current", wait until target phase == current phase before setting root snapshot
   147  // 3. Target epoch > current epoch (in future), wait until target epoch and target phase is reached before
   148  // setting root snapshot
   149  func DynamicStartPreInit(nodeConfig *NodeConfig) error {
   150  	ctx := context.Background()
   151  
   152  	log := nodeConfig.Logger.With().Str("component", "dynamic-startup").Logger()
   153  
   154  	// skip dynamic startup if the protocol state is bootstrapped
   155  	isBootstrapped, err := badgerstate.IsBootstrapped(nodeConfig.DB)
   156  	if err != nil {
   157  		return fmt.Errorf("could not check if state is boostrapped: %w", err)
   158  	}
   159  	if isBootstrapped {
   160  		log.Info().Msg("protocol state already bootstrapped, skipping dynamic startup")
   161  		return nil
   162  	}
   163  
   164  	// skip dynamic startup if a root snapshot file is specified - this takes priority
   165  	rootSnapshotPath := filepath.Join(nodeConfig.BootstrapDir, bootstrap.PathRootProtocolStateSnapshot)
   166  	if utilsio.FileExists(rootSnapshotPath) {
   167  		log.Info().
   168  			Str("root_snapshot_path", rootSnapshotPath).
   169  			Msg("protocol state is not bootstrapped, will bootstrap using configured root snapshot file, skipping dynamic startup")
   170  		return nil
   171  	}
   172  
   173  	// get flow client with secure client connection to download protocol snapshot from access node
   174  	config, err := common.NewFlowClientConfig(nodeConfig.DynamicStartupANAddress, nodeConfig.DynamicStartupANPubkey, flow.ZeroID, false)
   175  	if err != nil {
   176  		return fmt.Errorf("failed to create flow client config for node dynamic startup pre-init: %w", err)
   177  	}
   178  
   179  	flowClient, err := common.FlowClient(config)
   180  	if err != nil {
   181  		return fmt.Errorf("failed to create flow client for node dynamic startup pre-init: %w", err)
   182  	}
   183  
   184  	getSnapshotFunc := func(ctx context.Context) (protocol.Snapshot, error) {
   185  		return GetSnapshot(ctx, flowClient)
   186  	}
   187  
   188  	// validate dynamic startup epoch flag
   189  	startupEpoch, err := validateDynamicStartEpochFlags(ctx, getSnapshotFunc, nodeConfig.DynamicStartupEpoch)
   190  	if err != nil {
   191  		return fmt.Errorf("failed to validate flag --dynamic-start-epoch: %w", err)
   192  	}
   193  
   194  	startupPhase := flow.GetEpochPhase(nodeConfig.DynamicStartupEpochPhase)
   195  
   196  	// validate the rest of the dynamic startup flags
   197  	err = ValidateDynamicStartupFlags(nodeConfig.DynamicStartupANPubkey, nodeConfig.DynamicStartupANAddress, startupPhase)
   198  	if err != nil {
   199  		return err
   200  	}
   201  
   202  	snapshot, err := GetSnapshotAtEpochAndPhase(
   203  		ctx,
   204  		log,
   205  		startupEpoch,
   206  		startupPhase,
   207  		nodeConfig.BaseConfig.DynamicStartupSleepInterval,
   208  		getSnapshotFunc,
   209  	)
   210  	if err != nil {
   211  		return fmt.Errorf("failed to get snapshot at start up epoch (%d) and phase (%s): %w", startupEpoch, startupPhase.String(), err)
   212  	}
   213  
   214  	// set the root snapshot in the config - we will use this later to bootstrap
   215  	nodeConfig.RootSnapshot = snapshot
   216  	return nil
   217  }
   218  
   219  // validateDynamicStartEpochFlags parse the start epoch flag and return the uin64 value,
   220  // if epoch = current return the current epoch counter
   221  func validateDynamicStartEpochFlags(ctx context.Context, getSnapshot GetProtocolSnapshot, flagEpoch string) (uint64, error) {
   222  
   223  	// if flag is not `current` sentinel, it must be a specific epoch counter (uint64)
   224  	if flagEpoch != "current" {
   225  		epochCounter, err := strconv.ParseUint(flagEpoch, 10, 64)
   226  		if err != nil {
   227  			return 0, fmt.Errorf("invalid epoch counter flag (%s): %w", flagEpoch, err)
   228  		}
   229  		return epochCounter, nil
   230  	}
   231  
   232  	// we are using the current epoch, retrieve latest snapshot to determine this value
   233  	snapshot, err := getSnapshot(ctx)
   234  	if err != nil {
   235  		return 0, fmt.Errorf("failed to get snapshot: %w", err)
   236  	}
   237  
   238  	epochCounter, err := snapshot.Epochs().Current().Counter()
   239  	if err != nil {
   240  		return 0, fmt.Errorf("failed to get current epoch counter: %w", err)
   241  	}
   242  
   243  	return epochCounter, nil
   244  }