github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/consensus/hotstuff/notifications/telemetry.go (about)

     1  package notifications
     2  
     3  import (
     4  	"time"
     5  
     6  	"github.com/google/uuid"
     7  	"github.com/rs/zerolog"
     8  
     9  	"github.com/onflow/flow-go/consensus/hotstuff"
    10  	"github.com/onflow/flow-go/consensus/hotstuff/model"
    11  	"github.com/onflow/flow-go/model/flow"
    12  	"github.com/onflow/flow-go/utils/logging"
    13  )
    14  
    15  // TelemetryConsumer implements the hotstuff.Consumer interface.
    16  // consumes outbound notifications produced by the HotStuff state machine.
    17  // For this purpose, the TelemetryConsumer enriches the state machine's notifications:
    18  //   - The goal is to identify all events as belonging together that were emitted during
    19  //     a path through the state machine.
    20  //   - A path through the state machine begins when:
    21  //     -- a block has been received
    22  //     -- a QC has been constructed
    23  //     -- a TC has been constructed
    24  //     -- a partial TC has been constructed
    25  //     -- a local timeout has been initiated
    26  //   - Each path through the state machine is identified by a unique id.
    27  //
    28  // Additionally, the TelemetryConsumer reports events related to vote and timeout aggregation
    29  // but those events are not bound to a path, so they are reported differently.
    30  // Generally, the TelemetryConsumer could export the collected data to a variety of backends.
    31  // For now, we export the data to a logger.
    32  //
    33  // Telemetry does NOT capture slashing notifications
    34  type TelemetryConsumer struct {
    35  	NoopTimeoutCollectorConsumer
    36  	NoopVoteCollectorConsumer
    37  	pathHandler  *PathHandler
    38  	noPathLogger zerolog.Logger
    39  }
    40  
    41  // Telemetry implements consumers for _all happy-path_ interfaces in consensus/hotstuff/notifications/telemetry.go:
    42  var _ hotstuff.ParticipantConsumer = (*TelemetryConsumer)(nil)
    43  var _ hotstuff.CommunicatorConsumer = (*TelemetryConsumer)(nil)
    44  var _ hotstuff.FinalizationConsumer = (*TelemetryConsumer)(nil)
    45  var _ hotstuff.VoteCollectorConsumer = (*TelemetryConsumer)(nil)
    46  var _ hotstuff.TimeoutCollectorConsumer = (*TelemetryConsumer)(nil)
    47  
    48  // NewTelemetryConsumer creates consumer that reports telemetry events using logger backend.
    49  // Logger MUST include `chain` parameter as part of log context with corresponding chain ID to correctly map telemetry events to chain.
    50  func NewTelemetryConsumer(log zerolog.Logger) *TelemetryConsumer {
    51  	pathHandler := NewPathHandler(log)
    52  	return &TelemetryConsumer{
    53  		pathHandler:  pathHandler,
    54  		noPathLogger: pathHandler.log,
    55  	}
    56  }
    57  
    58  func (t *TelemetryConsumer) OnStart(currentView uint64) {
    59  	t.pathHandler.StartNextPath(currentView)
    60  	t.pathHandler.NextStep().Msg("OnStart")
    61  }
    62  
    63  func (t *TelemetryConsumer) OnReceiveProposal(currentView uint64, proposal *model.Proposal) {
    64  	block := proposal.Block
    65  	t.pathHandler.StartNextPath(currentView)
    66  	step := t.pathHandler.NextStep().
    67  		Uint64("block_view", block.View).
    68  		Hex("block_id", logging.ID(block.BlockID)).
    69  		Hex("block_proposer_id", logging.ID(block.ProposerID)).
    70  		Time("block_time", block.Timestamp).
    71  		Uint64("qc_view", block.QC.View).
    72  		Hex("qc_block_id", logging.ID(block.QC.BlockID))
    73  
    74  	lastViewTC := proposal.LastViewTC
    75  	if lastViewTC != nil {
    76  		step.
    77  			Uint64("last_view_tc_view", lastViewTC.View).
    78  			Uint64("last_view_tc_newest_qc_view", lastViewTC.NewestQC.View).
    79  			Hex("last_view_tc_newest_qc_block_id", logging.ID(lastViewTC.NewestQC.BlockID))
    80  	}
    81  
    82  	step.Msg("OnReceiveProposal")
    83  }
    84  
    85  func (t *TelemetryConsumer) OnReceiveQc(currentView uint64, qc *flow.QuorumCertificate) {
    86  	t.pathHandler.StartNextPath(currentView)
    87  	t.pathHandler.NextStep().
    88  		Uint64("qc_view", qc.View).
    89  		Hex("qc_block_id", logging.ID(qc.BlockID)).
    90  		Msg("OnReceiveQc")
    91  }
    92  
    93  func (t *TelemetryConsumer) OnReceiveTc(currentView uint64, tc *flow.TimeoutCertificate) {
    94  	t.pathHandler.StartNextPath(currentView)
    95  	t.pathHandler.NextStep().
    96  		Uint64("view", tc.View).
    97  		Uint64("newest_qc_view", tc.NewestQC.View).
    98  		Hex("newest_qc_block_id", logging.ID(tc.NewestQC.BlockID)).
    99  		Msg("OnReceiveTc")
   100  }
   101  
   102  func (t *TelemetryConsumer) OnPartialTc(currentView uint64, partialTc *hotstuff.PartialTcCreated) {
   103  	t.pathHandler.StartNextPath(currentView)
   104  	step := t.pathHandler.NextStep().
   105  		Uint64("view", partialTc.View).
   106  		Uint64("newest_qc_view", partialTc.NewestQC.View).
   107  		Hex("newest_qc_block_id", logging.ID(partialTc.NewestQC.BlockID))
   108  
   109  	lastViewTC := partialTc.LastViewTC
   110  	if lastViewTC != nil {
   111  		step.
   112  			Uint64("last_view_tc_view", lastViewTC.View).
   113  			Uint64("last_view_tc_newest_qc_view", lastViewTC.NewestQC.View).
   114  			Hex("last_view_tc_newest_qc_block_id", logging.ID(lastViewTC.NewestQC.BlockID))
   115  	}
   116  
   117  	step.Msg("OnPartialTc")
   118  }
   119  
   120  func (t *TelemetryConsumer) OnLocalTimeout(currentView uint64) {
   121  	t.pathHandler.StartNextPath(currentView)
   122  	t.pathHandler.NextStep().Msg("OnLocalTimeout")
   123  }
   124  
   125  func (t *TelemetryConsumer) OnEventProcessed() {
   126  	if t.pathHandler.IsCurrentPathClosed() {
   127  		return
   128  	}
   129  	t.pathHandler.NextStep().Msg("PathCompleted")
   130  	t.pathHandler.NextStep().Msg("OnEventProcessed")
   131  	t.pathHandler.CloseCurrentPath()
   132  }
   133  
   134  func (t *TelemetryConsumer) OnStartingTimeout(info model.TimerInfo) {
   135  	t.pathHandler.NextStep().
   136  		Float64("timeout_duration_seconds", info.Duration.Seconds()).
   137  		Time("timeout_cutoff", info.StartTime.Add(info.Duration)).
   138  		Msg("OnStartingTimeout")
   139  }
   140  
   141  func (t *TelemetryConsumer) OnBlockIncorporated(block *model.Block) {
   142  	t.pathHandler.NextStep().
   143  		Hex("block_id", logging.ID(block.BlockID)).
   144  		Msg("OnBlockIncorporated")
   145  }
   146  
   147  func (t *TelemetryConsumer) OnFinalizedBlock(block *model.Block) {
   148  	t.pathHandler.NextStep().
   149  		Hex("block_id", logging.ID(block.BlockID)).
   150  		Msg("OnFinalizedBlock")
   151  }
   152  
   153  func (t *TelemetryConsumer) OnQcTriggeredViewChange(oldView uint64, newView uint64, qc *flow.QuorumCertificate) {
   154  	t.pathHandler.NextStep().
   155  		Uint64("qc_view", qc.View).
   156  		Uint64("old_view", oldView).
   157  		Uint64("next_view", newView).
   158  		Hex("qc_block_id", qc.BlockID[:]).
   159  		Msg("OnQcTriggeredViewChange")
   160  }
   161  
   162  func (t *TelemetryConsumer) OnTcTriggeredViewChange(oldView uint64, newView uint64, tc *flow.TimeoutCertificate) {
   163  	t.pathHandler.NextStep().
   164  		Uint64("tc_view", tc.View).
   165  		Uint64("old_view", oldView).
   166  		Uint64("next_view", newView).
   167  		Uint64("tc_newest_qc_view", tc.NewestQC.View).
   168  		Hex("tc_newest_qc_block_id", tc.NewestQC.BlockID[:]).
   169  		Msg("OnTcTriggeredViewChange")
   170  }
   171  
   172  func (t *TelemetryConsumer) OnOwnVote(blockID flow.Identifier, view uint64, _ []byte, recipientID flow.Identifier) {
   173  	t.pathHandler.NextStep().
   174  		Uint64("voted_block_view", view).
   175  		Hex("voted_block_id", logging.ID(blockID)).
   176  		Hex("recipient_id", logging.ID(recipientID)).
   177  		Msg("OnOwnVote")
   178  }
   179  
   180  func (t *TelemetryConsumer) OnOwnProposal(proposal *flow.Header, targetPublicationTime time.Time) {
   181  	step := t.pathHandler.NextStep().
   182  		Uint64("block_view", proposal.View).
   183  		Hex("block_id", logging.ID(proposal.ID())).
   184  		Hex("block_proposer_id", logging.ID(proposal.ProposerID)).
   185  		Time("block_time", proposal.Timestamp).
   186  		Uint64("qc_view", proposal.ParentView).
   187  		Hex("qc_block_id", logging.ID(proposal.ParentID)).
   188  		Time("targetPublicationTime", targetPublicationTime)
   189  	lastViewTC := proposal.LastViewTC
   190  	if lastViewTC != nil {
   191  		step.
   192  			Uint64("last_view_tc_view", lastViewTC.View).
   193  			Uint64("last_view_tc_newest_qc_view", lastViewTC.NewestQC.View).
   194  			Hex("last_view_tc_newest_qc_block_id", logging.ID(lastViewTC.NewestQC.BlockID))
   195  	}
   196  	step.Msg("OnOwnProposal")
   197  }
   198  
   199  func (t *TelemetryConsumer) OnOwnTimeout(timeout *model.TimeoutObject) {
   200  	step := t.pathHandler.NextStep().
   201  		Uint64("view", timeout.View).
   202  		Uint64("timeout_tick", timeout.TimeoutTick).
   203  		Uint64("newest_qc_view", timeout.NewestQC.View).
   204  		Hex("newest_qc_block_id", logging.ID(timeout.NewestQC.BlockID))
   205  
   206  	lastViewTC := timeout.LastViewTC
   207  	if lastViewTC != nil {
   208  		step.
   209  			Uint64("last_view_tc_view", lastViewTC.View).
   210  			Uint64("last_view_tc_newest_qc_view", lastViewTC.NewestQC.View).
   211  			Hex("last_view_tc_newest_qc_block_id", logging.ID(lastViewTC.NewestQC.BlockID))
   212  	}
   213  	step.Msg("OnOwnTimeout")
   214  }
   215  
   216  func (t *TelemetryConsumer) OnVoteProcessed(vote *model.Vote) {
   217  	t.noPathLogger.Info().
   218  		Uint64("voted_block_view", vote.View).
   219  		Hex("voted_block_id", logging.ID(vote.BlockID)).
   220  		Hex("signer_id", logging.ID(vote.SignerID)).
   221  		Msg("OnVoteProcessed")
   222  }
   223  
   224  func (t *TelemetryConsumer) OnTimeoutProcessed(timeout *model.TimeoutObject) {
   225  	step := t.noPathLogger.Info().
   226  		Uint64("view", timeout.View).
   227  		Uint64("timeout_tick", timeout.TimeoutTick).
   228  		Uint64("newest_qc_view", timeout.NewestQC.View).
   229  		Hex("newest_qc_block_id", logging.ID(timeout.NewestQC.BlockID)).
   230  		Hex("signer_id", logging.ID(timeout.SignerID))
   231  	lastViewTC := timeout.LastViewTC
   232  	if lastViewTC != nil {
   233  		step.
   234  			Uint64("last_view_tc_view", lastViewTC.View).
   235  			Uint64("last_view_tc_newest_qc_view", lastViewTC.NewestQC.View).
   236  			Hex("last_view_tc_newest_qc_block_id", logging.ID(lastViewTC.NewestQC.BlockID))
   237  	}
   238  	step.Msg("OnTimeoutProcessed")
   239  }
   240  
   241  func (t *TelemetryConsumer) OnCurrentViewDetails(currentView, finalizedView uint64, currentLeader flow.Identifier) {
   242  	t.pathHandler.NextStep().
   243  		Uint64("view", currentView).
   244  		Uint64("finalized_view", finalizedView).
   245  		Hex("current_leader", currentLeader[:]).
   246  		Msg("OnCurrentViewDetails")
   247  }
   248  
   249  func (t *TelemetryConsumer) OnViewChange(oldView, newView uint64) {
   250  	t.pathHandler.NextStep().
   251  		Uint64("old_view", oldView).
   252  		Uint64("new_view", newView).
   253  		Msg("OnViewChange")
   254  }
   255  
   256  // PathHandler maintains a notion of the current path through the state machine.
   257  // It allows to close a path and open new path. Each path is identified by a unique
   258  // (randomly generated) uuid. Along each path, we can capture information about relevant
   259  // Steps (each step is represented by a Zerolog Event).
   260  // In case there is no currently open path, the PathHandler still returns a Step,
   261  // but such steps are logged as telemetry errors.
   262  type PathHandler struct {
   263  	log zerolog.Logger
   264  
   265  	// currentPath holds a Zerolog Context with the information about the current path.
   266  	// We represent the case where the current path has been closed by nil value.
   267  	currentPath *zerolog.Context
   268  }
   269  
   270  // NewPathHandler instantiate a new PathHandler.
   271  // The PathHandler has no currently open path
   272  // Logger MUST include `chain` parameter as part of log context with corresponding chain ID to correctly map telemetry events to chain.
   273  func NewPathHandler(log zerolog.Logger) *PathHandler {
   274  	return &PathHandler{
   275  		log:         log.With().Str("component", "hotstuff.telemetry").Logger(),
   276  		currentPath: nil,
   277  	}
   278  }
   279  
   280  // StartNextPath starts a new Path. Implicitly closes previous path if still open.
   281  // Returns self-reference for chaining
   282  func (p *PathHandler) StartNextPath(view uint64) *PathHandler {
   283  	if p.currentPath != nil {
   284  		p.NextStep().Msg("PathCompleted")
   285  	}
   286  	c := p.log.With().
   287  		Str("path_id", uuid.New().String()).
   288  		Uint64("view", view)
   289  	p.currentPath = &c
   290  	return p
   291  }
   292  
   293  // CloseCurrentPath closes the current path. Repeated calls to CloseCurrentPath are handled.
   294  // All Details hereafter, until a new Path is started, are logged as telemetry Errors.
   295  // Returns self-reference for chaining
   296  func (p *PathHandler) CloseCurrentPath() *PathHandler {
   297  	p.currentPath = nil
   298  	return p
   299  }
   300  
   301  // IsCurrentPathClosed if and only if the most recently started path has been closed.
   302  func (p *PathHandler) IsCurrentPathClosed() bool {
   303  	return p.currentPath == nil
   304  }
   305  
   306  // NextStep returns a Zerolog event for the currently open path. If the current path
   307  // is closed, the event will be logged as telemetry error.
   308  func (p *PathHandler) NextStep() *zerolog.Event {
   309  	if p.currentPath == nil {
   310  		l := p.log.With().Str("error", "no path").Logger()
   311  		return l.Error()
   312  	}
   313  	l := p.currentPath.Logger()
   314  	return l.Info()
   315  }