github.com/koko1123/flow-go-1@v0.29.6/ledger/complete/wal/wal.go

package wal

import (
	"fmt"
	"sort"

	prometheusWAL "github.com/m4ksio/wal/wal"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/rs/zerolog"

	"github.com/koko1123/flow-go-1/ledger"
	"github.com/koko1123/flow-go-1/ledger/complete/mtrie"
	"github.com/koko1123/flow-go-1/ledger/complete/mtrie/trie"
	"github.com/koko1123/flow-go-1/module"
)

const SegmentSize = 32 * 1024 * 1024 // 32 MB

type DiskWAL struct {
	wal            *prometheusWAL.WAL
	paused         bool
	forestCapacity int
	pathByteSize   int
	log            zerolog.Logger
	dir            string
}

// TODO use real logger and metrics, but that would require passing them to Trie storage
func NewDiskWAL(logger zerolog.Logger, reg prometheus.Registerer, metrics module.WALMetrics, dir string, forestCapacity int, pathByteSize int, segmentSize int) (*DiskWAL, error) {
	w, err := prometheusWAL.NewSize(logger, reg, dir, segmentSize, false)
	if err != nil {
		return nil, err
	}
	return &DiskWAL{
		wal:            w,
		paused:         false,
		forestCapacity: forestCapacity,
		pathByteSize:   pathByteSize,
		log:            logger.With().Str("ledger_mod", "diskwal").Logger(),
		dir:            dir,
	}, nil
}

// PauseRecord pauses recording of updates and deletions to the write-ahead log.
func (w *DiskWAL) PauseRecord() {
	w.paused = true
}

// UnpauseRecord resumes recording of updates and deletions to the write-ahead log.
func (w *DiskWAL) UnpauseRecord() {
	w.paused = false
}

// RecordUpdate writes the trie update to the write-ahead log on disk.
// If recording is not paused, it returns the number of the segment file the update was written to,
// and skipped is false.
// If recording is paused, the update is not written, and skipped is true.
func (w *DiskWAL) RecordUpdate(update *ledger.TrieUpdate) (segmentNum int, skipped bool, err error) {
	if w.paused {
		return 0, true, nil
	}

	bytes := EncodeUpdate(update)

	locations, err := w.wal.Log(bytes)

	if err != nil {
		return 0, false, fmt.Errorf("error while recording update in LedgerWAL: %w", err)
	}
	if len(locations) != 1 {
		return 0, false, fmt.Errorf("error while recording update in LedgerWAL: got %d locations, expected 1 location", len(locations))
	}

	return locations[0].Segment, false, nil
}

// RecordDelete writes the deletion of the trie with the given root hash to the write-ahead log on disk.
// If recording is paused, the deletion is skipped.
func (w *DiskWAL) RecordDelete(rootHash ledger.RootHash) error {
	if w.paused {
		return nil
	}

	bytes := EncodeDelete(rootHash)

	_, err := w.wal.Log(bytes)

	if err != nil {
		return fmt.Errorf("error while recording delete in LedgerWAL: %w", err)
	}
	return nil
}
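// exampleRecordUpdate is an illustrative sketch and not part of the original file.
// It shows how a caller might interpret RecordUpdate's return values; the decision
// to merely log a skipped update is an assumption made for the example.
func exampleRecordUpdate(w *DiskWAL, update *ledger.TrieUpdate) error {
	segmentNum, skipped, err := w.RecordUpdate(update)
	if err != nil {
		return fmt.Errorf("could not write update to WAL: %w", err)
	}
	if skipped {
		// Recording is currently paused (e.g. while a checkpoint is being written),
		// so the update was not persisted to any segment.
		w.log.Debug().Msg("WAL recording paused, update skipped")
		return nil
	}
	w.log.Debug().Int("segment", segmentNum).Msg("update written to WAL segment")
	return nil
}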
// ReplayOnForest replays the write-ahead log onto the given forest:
// loaded checkpoints are added as tries, updates are applied, and deletions are ignored.
func (w *DiskWAL) ReplayOnForest(forest *mtrie.Forest) error {
	return w.Replay(
		func(tries []*trie.MTrie) error {
			err := forest.AddTries(tries)
			if err != nil {
				return fmt.Errorf("adding rebuilt tries to forest failed: %w", err)
			}
			return nil
		},
		func(update *ledger.TrieUpdate) error {
			_, err := forest.Update(update)
			return err
		},
		func(rootHash ledger.RootHash) error {
			return nil
		},
	)
}

// Segments returns the numbers of the first and last segment files currently on disk.
func (w *DiskWAL) Segments() (first, last int, err error) {
	return prometheusWAL.Segments(w.wal.Dir())
}

// Replay replays all segments on disk, using the latest usable checkpoint as a starting point,
// and invokes the given callbacks for the loaded checkpoint, each update, and each deletion.
func (w *DiskWAL) Replay(
	checkpointFn func(tries []*trie.MTrie) error,
	updateFn func(update *ledger.TrieUpdate) error,
	deleteFn func(ledger.RootHash) error,
) error {
	from, to, err := w.Segments()
	if err != nil {
		return fmt.Errorf("could not find segments: %w", err)
	}
	err = w.replay(from, to, checkpointFn, updateFn, deleteFn, true)
	if err != nil {
		return fmt.Errorf("could not replay segments [%v:%v]: %w", from, to, err)
	}
	return nil
}

// ReplayLogsOnly replays all segments on disk without consulting checkpoints
// (except the root checkpoint when replay starts at segment 0).
func (w *DiskWAL) ReplayLogsOnly(
	checkpointFn func(tries []*trie.MTrie) error,
	updateFn func(update *ledger.TrieUpdate) error,
	deleteFn func(rootHash ledger.RootHash) error,
) error {
	from, to, err := w.Segments()
	if err != nil {
		return fmt.Errorf("could not find segments: %w", err)
	}
	err = w.replay(from, to, checkpointFn, updateFn, deleteFn, false)
	if err != nil {
		return fmt.Errorf("could not replay WAL only for segments [%v:%v]: %w", from, to, err)
	}
	return nil
}
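// Illustrative note (not part of the original file): with segments 0-12 on disk and
// checkpoint files for segments 4 and 8, replay(0, 12, ...) first tries checkpoint 8;
// on success it passes the restored tries to checkpointFn and then replays only
// segments 9 through 12 record by record. If checkpoint 8 fails to load, it falls back
// to checkpoint 4 (and replays segments 5-12), and if no checkpoint loads it replays
// every segment starting from 0. The concrete numbers are assumptions chosen for
// illustration.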
func (w *DiskWAL) replay(
	from, to int,
	checkpointFn func(tries []*trie.MTrie) error,
	updateFn func(update *ledger.TrieUpdate) error,
	deleteFn func(rootHash ledger.RootHash) error,
	useCheckpoints bool,
) error {

	w.log.Info().Msgf("loading checkpoint with WAL from %d to %d", from, to)

	if to < from {
		return fmt.Errorf("end of range cannot be smaller than beginning")
	}

	loadedCheckpoint := -1
	startSegment := from
	checkpointLoaded := false

	checkpointer, err := w.NewCheckpointer()
	if err != nil {
		return fmt.Errorf("cannot create checkpointer: %w", err)
	}

	if useCheckpoints {
		allCheckpoints, err := checkpointer.Checkpoints()
		if err != nil {
			return fmt.Errorf("cannot get list of checkpoints: %w", err)
		}

		var availableCheckpoints []int

		// if there are no checkpoints already, don't bother
		if len(allCheckpoints) > 0 {
			// from-1 accounts for the checkpoint covering the segment just before the range,
			// i.e. checkpoint 8 when replaying segments 9-12
			availableCheckpoints = getPossibleCheckpoints(allCheckpoints, from-1, to)
		}

		for len(availableCheckpoints) > 0 {
			// as long as there are checkpoints to try, always start with the latest checkpoint file,
			// since it allows us to load fewer segments
			latestCheckpoint := availableCheckpoints[len(availableCheckpoints)-1]

			w.log.Info().Int("checkpoint", latestCheckpoint).Msg("loading checkpoint")

			forestSequencing, err := checkpointer.LoadCheckpoint(latestCheckpoint)
			if err != nil {
				w.log.Warn().Int("checkpoint", latestCheckpoint).Err(err).
					Msg("checkpoint loading failed")

				availableCheckpoints = availableCheckpoints[:len(availableCheckpoints)-1]
				continue
			}

			w.log.Info().Int("checkpoint", latestCheckpoint).Msg("checkpoint loaded")

			err = checkpointFn(forestSequencing)
			if err != nil {
				return fmt.Errorf("error while handling checkpoint: %w", err)
			}
			loadedCheckpoint = latestCheckpoint
			checkpointLoaded = true
			break
		}

		if loadedCheckpoint != -1 && loadedCheckpoint == to {
			return nil
		}

		if loadedCheckpoint >= 0 {
			startSegment = loadedCheckpoint + 1
		}
	}

	if loadedCheckpoint == -1 && startSegment == 0 {
		hasRootCheckpoint, err := checkpointer.HasRootCheckpoint()
		if err != nil {
			return fmt.Errorf("cannot check root checkpoint existence: %w", err)
		}
		if hasRootCheckpoint {
			w.log.Info().Msgf("loading root checkpoint")

			flattenedForest, err := checkpointer.LoadRootCheckpoint()
			if err != nil {
				return fmt.Errorf("cannot load root checkpoint: %w", err)
			}
			err = checkpointFn(flattenedForest)
			if err != nil {
				return fmt.Errorf("error while handling root checkpoint: %w", err)
			}

			w.log.Info().Msgf("root checkpoint loaded")
			checkpointLoaded = true
		}
	}

	w.log.Info().
		Bool("checkpoint_loaded", checkpointLoaded).
		Int("loaded_checkpoint", loadedCheckpoint).
		Msgf("replaying segments from %d to %d", startSegment, to)

	sr, err := prometheusWAL.NewSegmentsRangeReader(prometheusWAL.SegmentRange{
		Dir:   w.wal.Dir(),
		First: startSegment,
		Last:  to,
	})
	if err != nil {
		return fmt.Errorf("cannot create segment reader: %w", err)
	}

	reader := prometheusWAL.NewReader(sr)

	defer sr.Close()

	for reader.Next() {
		record := reader.Record()
		operation, rootHash, update, err := Decode(record)
		if err != nil {
			return fmt.Errorf("cannot decode LedgerWAL record: %w", err)
		}

		switch operation {
		case WALUpdate:
			err = updateFn(update)
			if err != nil {
				return fmt.Errorf("error while processing LedgerWAL update: %w", err)
			}
		case WALDelete:
			err = deleteFn(rootHash)
			if err != nil {
				return fmt.Errorf("error while processing LedgerWAL deletion: %w", err)
			}
		}

		err = reader.Err()
		if err != nil {
			return fmt.Errorf("cannot read LedgerWAL: %w", err)
		}
	}

	w.log.Info().Msgf("finished loading checkpoint and replaying WAL from %d to %d", from, to)

	return nil
}

// getPossibleCheckpoints returns the checkpoint numbers from allCheckpoints that fall
// into the range [from, to]. allCheckpoints must be sorted in ascending order.
func getPossibleCheckpoints(allCheckpoints []int, from, to int) []int {
	// list of checkpoints is sorted
	indexFrom := sort.SearchInts(allCheckpoints, from)
	indexTo := sort.SearchInts(allCheckpoints, to)

	// all checkpoints are earlier than `to`, return everything from `from` onwards
	if indexTo == len(allCheckpoints) {
		return allCheckpoints[indexFrom:indexTo]
	}

	// `to` matches a checkpoint exactly, include it
	if allCheckpoints[indexTo] == to {
		return allCheckpoints[indexFrom : indexTo+1]
	}

	// indexTo == 0 means every checkpoint is newer than `to`, so none are usable
	if indexTo == 0 {
		return nil
	}

	return allCheckpoints[indexFrom:indexTo]
}

// NewCheckpointer returns a Checkpointer for this WAL
func (w *DiskWAL) NewCheckpointer() (*Checkpointer, error) {
	return NewCheckpointer(w, w.pathByteSize, w.forestCapacity), nil
}

// Ready implements interface module.ReadyDoneAware.
// The WAL is ready for use immediately after construction.
func (w *DiskWAL) Ready() <-chan struct{} {
	ready := make(chan struct{})
	close(ready)
	return ready
}

// Done implements interface module.ReadyDoneAware.
// It closes all the open write-ahead log files.
func (w *DiskWAL) Done() <-chan struct{} {
	err := w.wal.Close()
	if err != nil {
		w.log.Err(err).Msg("error while closing WAL")
	}
	done := make(chan struct{})
	close(done)
	return done
}
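// exampleDiskWALLifecycle is an illustrative sketch and not part of the original file.
// It shows the typical create -> replay -> record -> shut down flow of a DiskWAL.
// The caller-supplied metrics, directory, forest, and update are assumptions for the
// example, as are the forest capacity of 1000 and the 32-byte path size; zerolog.Nop()
// and prometheus.NewRegistry() stand in for the node's real logger and registry.
func exampleDiskWALLifecycle(metrics module.WALMetrics, dir string, forest *mtrie.Forest, update *ledger.TrieUpdate) error {
	w, err := NewDiskWAL(zerolog.Nop(), prometheus.NewRegistry(), metrics, dir, 1000, 32, SegmentSize)
	if err != nil {
		return fmt.Errorf("cannot create disk WAL: %w", err)
	}
	// Close the open segment files on exit.
	defer func() { <-w.Done() }()

	// Rebuild the in-memory forest from the latest usable checkpoint plus newer segments.
	if err := w.ReplayOnForest(forest); err != nil {
		return fmt.Errorf("cannot replay WAL into forest: %w", err)
	}

	// Persist a new trie update; segmentNum identifies the segment it was written to.
	segmentNum, skipped, err := w.RecordUpdate(update)
	if err != nil {
		return fmt.Errorf("cannot record update: %w", err)
	}
	if !skipped {
		fmt.Printf("update recorded in segment %d\n", segmentNum)
	}
	return nil
}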
// LedgerWAL is the interface for a write-ahead log of ledger trie updates and deletions,
// with support for checkpointing and replay.
type LedgerWAL interface {
	module.ReadyDoneAware

	NewCheckpointer() (*Checkpointer, error)
	PauseRecord()
	UnpauseRecord()
	RecordUpdate(update *ledger.TrieUpdate) (int, bool, error)
	RecordDelete(rootHash ledger.RootHash) error
	ReplayOnForest(forest *mtrie.Forest) error
	Segments() (first, last int, err error)
	Replay(
		checkpointFn func(tries []*trie.MTrie) error,
		updateFn func(update *ledger.TrieUpdate) error,
		deleteFn func(ledger.RootHash) error,
	) error
	ReplayLogsOnly(
		checkpointFn func(tries []*trie.MTrie) error,
		updateFn func(update *ledger.TrieUpdate) error,
		deleteFn func(rootHash ledger.RootHash) error,
	) error
}
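// Compile-time check (an illustrative addition, not part of the original file)
// that DiskWAL satisfies the LedgerWAL interface defined above.
var _ LedgerWAL = (*DiskWAL)(nil)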