github.com/ethereum-optimism/optimism@v1.7.2/packages/chain-mon/src/replica-mon/service.ts (about)

     1  import { Provider, Block } from '@ethersproject/abstract-provider'
     2  import {
     3    BaseServiceV2,
     4    StandardOptions,
     5    Counter,
     6    Gauge,
     7    validators,
     8  } from '@eth-optimism/common-ts'
     9  import { sleep } from '@eth-optimism/core-utils'
    10  
    11  import { version } from '../../package.json'
    12  
    13  type HealthcheckOptions = {
    14    referenceRpcProvider: Provider
    15    targetRpcProvider: Provider
    16    onDivergenceWaitMs?: number
    17  }
    18  
    19  type HealthcheckMetrics = {
    20    lastMatchingStateRootHeight: Gauge
    21    isCurrentlyDiverged: Gauge
    22    referenceHeight: Gauge
    23    targetHeight: Gauge
    24    heightDifference: Gauge
    25    targetConnectionFailures: Counter
    26    referenceConnectionFailures: Counter
    27  }
    28  
    29  type HealthcheckState = {}
    30  
    31  export class HealthcheckService extends BaseServiceV2<
    32    HealthcheckOptions,
    33    HealthcheckMetrics,
    34    HealthcheckState
    35  > {
    36    constructor(options?: Partial<HealthcheckOptions & StandardOptions>) {
    37      super({
    38        version,
    39        name: 'healthcheck',
    40        options: {
    41          loopIntervalMs: 5000,
    42          ...options,
    43        },
    44        optionsSpec: {
    45          referenceRpcProvider: {
    46            validator: validators.provider,
    47            desc: 'Provider for interacting with L1',
    48          },
    49          targetRpcProvider: {
    50            validator: validators.provider,
    51            desc: 'Provider for interacting with L2',
    52          },
    53          onDivergenceWaitMs: {
    54            validator: validators.num,
    55            desc: 'Waiting time in ms per loop when divergence is detected',
    56            default: 60_000,
    57            public: true,
    58          },
    59        },
    60        metricsSpec: {
    61          lastMatchingStateRootHeight: {
    62            type: Gauge,
    63            desc: 'Highest matching state root between target and reference',
    64          },
    65          isCurrentlyDiverged: {
    66            type: Gauge,
    67            desc: 'Whether or not the two nodes are currently diverged',
    68          },
    69          referenceHeight: {
    70            type: Gauge,
    71            desc: 'Block height of the reference client',
    72          },
    73          targetHeight: {
    74            type: Gauge,
    75            desc: 'Block height of the target client',
    76          },
    77          heightDifference: {
    78            type: Gauge,
    79            desc: 'Difference in block heights between the two clients',
    80          },
    81          targetConnectionFailures: {
    82            type: Counter,
    83            desc: 'Number of connection failures to the target client',
    84          },
    85          referenceConnectionFailures: {
    86            type: Counter,
    87            desc: 'Number of connection failures to the reference client',
    88          },
    89        },
    90      })
    91    }
    92  
    93    async main() {
    94      // Get the latest block from the target client and check for connection failures.
    95      let targetLatest: Block
    96      try {
    97        targetLatest = await this.options.targetRpcProvider.getBlock('latest')
    98      } catch (err) {
    99        if (err.message.includes('could not detect network')) {
   100          this.logger.error('target client not connected')
   101          this.metrics.targetConnectionFailures.inc()
   102          return
   103        } else {
   104          throw err
   105        }
   106      }
   107  
   108      // Get the latest block from the reference client and check for connection failures.
   109      let referenceLatest: Block
   110      try {
   111        referenceLatest = await this.options.referenceRpcProvider.getBlock(
   112          'latest'
   113        )
   114      } catch (err) {
   115        if (err.message.includes('could not detect network')) {
   116          this.logger.error('reference client not connected')
   117          this.metrics.referenceConnectionFailures.inc()
   118          return
   119        } else {
   120          throw err
   121        }
   122      }
   123  
   124      // Later logic will depend on the height difference.
   125      const heightDiff = Math.abs(referenceLatest.number - targetLatest.number)
   126      const minBlock = Math.min(targetLatest.number, referenceLatest.number)
   127  
   128      // Update these metrics first so they'll refresh no matter what.
   129      this.metrics.targetHeight.set(targetLatest.number)
   130      this.metrics.referenceHeight.set(referenceLatest.number)
   131      this.metrics.heightDifference.set(heightDiff)
   132  
   133      this.logger.info(`latest block heights`, {
   134        targetHeight: targetLatest.number,
   135        referenceHeight: referenceLatest.number,
   136        heightDifference: heightDiff,
   137        minBlockNumber: minBlock,
   138      })
   139  
   140      const reference = await this.options.referenceRpcProvider.getBlock(minBlock)
   141      if (!reference) {
   142        // This is ok, but we should log it and restart the loop.
   143        this.logger.info(`reference block was not found`, {
   144          blockNumber: reference.number,
   145        })
   146        return
   147      }
   148  
   149      const target = await this.options.targetRpcProvider.getBlock(minBlock)
   150      if (!target) {
   151        // This is ok, but we should log it and restart the loop.
   152        this.logger.info(`target block was not found`, {
   153          blockNumber: target.number,
   154        })
   155        return
   156      }
   157  
   158      // We used to use state roots here, but block hashes are even more reliable because they will
   159      // catch discrepancies in blocks that may not impact the state. For example, if clients have
   160      // blocks with two different timestamps, the state root will only diverge if the timestamp is
   161      // actually used during the transaction(s) within the block.
   162      if (reference.hash !== target.hash) {
   163        this.logger.error(`reference client has different hash for block`, {
   164          blockNumber: target.number,
   165          referenceHash: reference.hash,
   166          targetHash: target.hash,
   167        })
   168  
   169        // The main loop polls for "latest" so aren't checking every block. We need to use a binary
   170        // search to find the first block where a mismatch occurred.
   171        this.logger.info(`beginning binary search to find first mismatched block`)
   172  
   173        let start = 0
   174        let end = target.number
   175        while (start !== end) {
   176          const mid = Math.floor((start + end) / 2)
   177          this.logger.info(`checking block`, { blockNumber: mid })
   178          const blockA = await this.options.referenceRpcProvider.getBlock(mid)
   179          const blockB = await this.options.targetRpcProvider.getBlock(mid)
   180  
   181          if (blockA.hash === blockB.hash) {
   182            start = mid + 1
   183          } else {
   184            end = mid
   185          }
   186        }
   187  
   188        this.logger.info(`found first mismatched block`, { blockNumber: end })
   189        this.metrics.lastMatchingStateRootHeight.set(end)
   190        this.metrics.isCurrentlyDiverged.set(1)
   191  
   192        // Old version of the service would exit here, but we want to keep looping just in case the
   193        // the system recovers later. This is better than exiting because it means we don't have to
   194        // restart the entire service. Running these checks once per minute will not trigger too many
   195        // requests, so this should be fine.
   196        await sleep(this.options.onDivergenceWaitMs)
   197        return
   198      }
   199  
   200      this.logger.info(`blocks are matching`, {
   201        blockNumber: target.number,
   202      })
   203  
   204      // Update latest matching state root height and reset the diverged metric in case it was set.
   205      this.metrics.lastMatchingStateRootHeight.set(target.number)
   206      this.metrics.isCurrentlyDiverged.set(0)
   207    }
   208  }
   209  
   210  if (require.main === module) {
   211    const service = new HealthcheckService()
   212    service.run()
   213  }