import { Provider, Block } from '@ethersproject/abstract-provider'
import {
  BaseServiceV2,
  StandardOptions,
  Counter,
  Gauge,
  validators,
} from '@eth-optimism/common-ts'
import { sleep } from '@eth-optimism/core-utils'

import { version } from '../../package.json'

type HealthcheckOptions = {
  referenceRpcProvider: Provider
  targetRpcProvider: Provider
  onDivergenceWaitMs?: number
}

type HealthcheckMetrics = {
  lastMatchingStateRootHeight: Gauge
  isCurrentlyDiverged: Gauge
  referenceHeight: Gauge
  targetHeight: Gauge
  heightDifference: Gauge
  targetConnectionFailures: Counter
  referenceConnectionFailures: Counter
}

type HealthcheckState = {}

/**
 * Substring of the error message that `main` treats as "the client is not
 * reachable" rather than as an unexpected failure.
 */
const NETWORK_DETECTION_ERROR = 'could not detect network'

/**
 * Returns true when a caught value carries a string `message` containing
 * NETWORK_DETECTION_ERROR.
 *
 * The value is narrowed defensively: anything can be thrown, and reading
 * `.message.includes` on a non-Error value would raise a secondary TypeError
 * that hides the original failure.
 *
 * @param err Value caught from a provider call.
 * @returns Whether the value should be counted as a connection failure.
 */
const isConnectionFailure = (err: unknown): boolean => {
  if (typeof err !== 'object' || err === null) {
    return false
  }
  const message = (err as { message?: unknown }).message
  return (
    typeof message === 'string' && message.includes(NETWORK_DETECTION_ERROR)
  )
}

/**
 * Service that compares the chain seen by a "target" provider against the
 * chain seen by a "reference" provider and reports the result as metrics.
 *
 * Each invocation of `main` fetches the latest block of both providers,
 * compares the block hashes at the lower of the two heights and, on a
 * mismatch, binary-searches for the first block at which the hashes differ.
 */
export class HealthcheckService extends BaseServiceV2<
  HealthcheckOptions,
  HealthcheckMetrics,
  HealthcheckState
> {
  /**
   * @param options Optional overrides; spread over the `loopIntervalMs: 5000`
   * default that is passed to the base service.
   */
  constructor(options?: Partial<HealthcheckOptions & StandardOptions>) {
    super({
      version,
      name: 'healthcheck',
      options: {
        loopIntervalMs: 5000,
        ...options,
      },
      // NOTE(review): the two provider descriptions mention L1 and L2, yet
      // both providers are used identically below as reference/target clients
      // whose blocks are compared by number. Confirm the wording is intended.
      optionsSpec: {
        referenceRpcProvider: {
          validator: validators.provider,
          desc: 'Provider for interacting with L1',
        },
        targetRpcProvider: {
          validator: validators.provider,
          desc: 'Provider for interacting with L2',
        },
        onDivergenceWaitMs: {
          validator: validators.num,
          desc: 'Waiting time in ms per loop when divergence is detected',
          default: 60_000,
          public: true,
        },
      },
      metricsSpec: {
        lastMatchingStateRootHeight: {
          type: Gauge,
          desc: 'Highest matching state root between target and reference',
        },
        isCurrentlyDiverged: {
          type: Gauge,
          desc: 'Whether or not the two nodes are currently diverged',
        },
        referenceHeight: {
          type: Gauge,
          desc: 'Block height of the reference client',
        },
        targetHeight: {
          type: Gauge,
          desc: 'Block height of the target client',
        },
        heightDifference: {
          type: Gauge,
          desc: 'Difference in block heights between the two clients',
        },
        targetConnectionFailures: {
          type: Counter,
          desc: 'Number of connection failures to the target client',
        },
        referenceConnectionFailures: {
          type: Counter,
          desc: 'Number of connection failures to the reference client',
        },
      },
    })
  }

  /**
   * Runs one comparison pass between the target and the reference client.
   *
   * Returns early (without touching the divergence metrics) when either
   * client is unreachable or when the block at the common height cannot be
   * fetched from one of them.
   *
   * @throws Re-throws any provider error that is not a connection failure.
   */
  async main(): Promise<void> {
    // Get the latest block from the target client and check for connection failures.
    let targetLatest: Block
    try {
      targetLatest = await this.options.targetRpcProvider.getBlock('latest')
    } catch (err) {
      if (isConnectionFailure(err)) {
        this.logger.error('target client not connected')
        this.metrics.targetConnectionFailures.inc()
        return
      }
      throw err
    }

    // Get the latest block from the reference client and check for connection failures.
    let referenceLatest: Block
    try {
      referenceLatest = await this.options.referenceRpcProvider.getBlock(
        'latest'
      )
    } catch (err) {
      if (isConnectionFailure(err)) {
        this.logger.error('reference client not connected')
        this.metrics.referenceConnectionFailures.inc()
        return
      }
      throw err
    }

    // Later logic will depend on the height difference.
    const heightDiff = Math.abs(referenceLatest.number - targetLatest.number)
    const minBlock = Math.min(targetLatest.number, referenceLatest.number)

    // Update these metrics first so they'll refresh no matter what.
    this.metrics.targetHeight.set(targetLatest.number)
    this.metrics.referenceHeight.set(referenceLatest.number)
    this.metrics.heightDifference.set(heightDiff)

    this.logger.info(`latest block heights`, {
      targetHeight: targetLatest.number,
      referenceHeight: referenceLatest.number,
      heightDifference: heightDiff,
      minBlockNumber: minBlock,
    })

    const reference = await this.options.referenceRpcProvider.getBlock(minBlock)
    if (!reference) {
      // This is ok, but we should log it and restart the loop. The block is
      // missing here, so the requested height is logged rather than a field
      // of the (absent) block.
      this.logger.info(`reference block was not found`, {
        blockNumber: minBlock,
      })
      return
    }

    const target = await this.options.targetRpcProvider.getBlock(minBlock)
    if (!target) {
      // This is ok, but we should log it and restart the loop. As above, log
      // the requested height because there is no block to read it from.
      this.logger.info(`target block was not found`, {
        blockNumber: minBlock,
      })
      return
    }

    // We used to use state roots here, but block hashes are even more reliable because they will
    // catch discrepancies in blocks that may not impact the state. For example, if clients have
    // blocks with two different timestamps, the state root will only diverge if the timestamp is
    // actually used during the transaction(s) within the block.
    if (reference.hash !== target.hash) {
      this.logger.error(`reference client has different hash for block`, {
        blockNumber: target.number,
        referenceHash: reference.hash,
        targetHash: target.hash,
      })

      // The main loop polls for "latest" so we aren't checking every block. We need to use a
      // binary search to find the first block where a mismatch occurred.
      this.logger.info(`beginning binary search to find first mismatched block`)

      const firstMismatch = await this.findFirstMismatchedBlock(target.number)
      if (firstMismatch !== undefined) {
        this.logger.info(`found first mismatched block`, {
          blockNumber: firstMismatch,
        })
        // NOTE(review): this records the first *mismatched* height, while the
        // metric is described as the highest *matching* one. Left unchanged
        // because dashboards/alerts may rely on the current value; confirm
        // whether `firstMismatch - 1` is what is wanted.
        this.metrics.lastMatchingStateRootHeight.set(firstMismatch)
      }

      // The hash mismatch above already proves divergence, so flag it even
      // when the search could not pin down the exact block.
      this.metrics.isCurrentlyDiverged.set(1)

      // Old version of the service would exit here, but we want to keep looping just in case the
      // system recovers later. This is better than exiting because it means we don't have to
      // restart the entire service. Running these checks once per minute will not trigger too many
      // requests, so this should be fine.
      await sleep(this.options.onDivergenceWaitMs)
      return
    }

    this.logger.info(`blocks are matching`, {
      blockNumber: target.number,
    })

    // Update latest matching state root height and reset the diverged metric in case it was set.
    this.metrics.lastMatchingStateRootHeight.set(target.number)
    this.metrics.isCurrentlyDiverged.set(0)
  }

  /**
   * Binary-searches the range [0, upperBound] for the lowest block number at
   * which the reference and target clients report different block hashes.
   *
   * The caller is expected to have already observed a mismatch at
   * `upperBound`; that block is never re-fetched here.
   *
   * @param upperBound Block number known to mismatch.
   * @returns The first mismatched block number, or `undefined` when a block
   * needed by the search could not be fetched from one of the clients.
   */
  private async findFirstMismatchedBlock(
    upperBound: number
  ): Promise<number | undefined> {
    let start = 0
    let end = upperBound
    while (start !== end) {
      const mid = Math.floor((start + end) / 2)
      this.logger.info(`checking block`, { blockNumber: mid })

      // The two lookups are independent, so issue them together.
      const [blockA, blockB] = await Promise.all([
        this.options.referenceRpcProvider.getBlock(mid),
        this.options.targetRpcProvider.getBlock(mid),
      ])

      if (!blockA || !blockB) {
        // Without both blocks the hashes cannot be compared; give up on the
        // search instead of dereferencing a missing block.
        this.logger.info(`block was not found during binary search`, {
          blockNumber: mid,
          referenceFound: Boolean(blockA),
          targetFound: Boolean(blockB),
        })
        return undefined
      }

      if (blockA.hash === blockB.hash) {
        start = mid + 1
      } else {
        end = mid
      }
    }

    return end
  }
}

// Start the service only when this file is executed directly.
if (require.main === module) {
  const service = new HealthcheckService()
  service.run()
}