github.com/koko1123/flow-go-1@v0.29.6/ledger/complete/wal/wal.go

package wal

import (
	"fmt"
	"sort"

	prometheusWAL "github.com/m4ksio/wal/wal"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/rs/zerolog"

	"github.com/koko1123/flow-go-1/ledger"
	"github.com/koko1123/flow-go-1/ledger/complete/mtrie"
	"github.com/koko1123/flow-go-1/ledger/complete/mtrie/trie"
	"github.com/koko1123/flow-go-1/module"
)

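// SegmentSize is the size, in bytes, of a single write-ahead log segment file (32 MB).
// Callers typically pass it as the segmentSize argument of NewDiskWAL.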
const SegmentSize = 32 * 1024 * 1024 // 32 MB

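// DiskWAL is an on-disk write-ahead log for ledger trie updates and deletions.
// It appends encoded operations to segment files and can replay them, together
// with checkpoints, to rebuild the trie forest. DiskWAL implements the LedgerWAL
// interface defined at the bottom of this file.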
type DiskWAL struct {
	wal            *prometheusWAL.WAL
	paused         bool
	forestCapacity int
	pathByteSize   int
	log            zerolog.Logger
	dir            string
}

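// NewDiskWAL returns a write-ahead log that stores its records in segment files
// of segmentSize bytes under dir.
//
// A minimal construction sketch (the logger, registry, directory, forest
// capacity, and path size shown here are illustrative assumptions, not values
// defined in this file):
//
//	diskWAL, err := NewDiskWAL(
//		zerolog.Nop(),            // discard logs
//		prometheus.NewRegistry(), // standalone metrics registry
//		nil,                      // module.WALMetrics (not used by this constructor)
//		"/tmp/ledger-wal",        // directory for segment files
//		1000,                     // forest capacity
//		32,                       // path size in bytes
//		SegmentSize,
//	)
//	if err != nil {
//		// handle error
//	}
//	defer func() { <-diskWAL.Done() }()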
// TODO use real logger and metrics, but that would require passing them to Trie storage
func NewDiskWAL(logger zerolog.Logger, reg prometheus.Registerer, metrics module.WALMetrics, dir string, forestCapacity int, pathByteSize int, segmentSize int) (*DiskWAL, error) {
	w, err := prometheusWAL.NewSize(logger, reg, dir, segmentSize, false)
	if err != nil {
		return nil, err
	}
	return &DiskWAL{
		wal:            w,
		paused:         false,
		forestCapacity: forestCapacity,
		pathByteSize:   pathByteSize,
		log:            logger.With().Str("ledger_mod", "diskwal").Logger(),
		dir:            dir,
	}, nil
}

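// PauseRecord pauses writing to the WAL: subsequent RecordUpdate and RecordDelete
// calls are skipped until UnpauseRecord is called.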
func (w *DiskWAL) PauseRecord() {
	w.paused = true
}

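// UnpauseRecord resumes writing updates and deletions to the WAL.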
func (w *DiskWAL) UnpauseRecord() {
	w.paused = false
}

// RecordUpdate writes the trie update to the write-ahead log on disk.
// If logging is not paused, it returns the number of the WAL segment the update
// was written to, and skipped is false. If logging is paused, the update is not
// written and skipped is true.
func (w *DiskWAL) RecordUpdate(update *ledger.TrieUpdate) (segmentNum int, skipped bool, err error) {
	if w.paused {
		return 0, true, nil
	}

	bytes := EncodeUpdate(update)

	locations, err := w.wal.Log(bytes)

	if err != nil {
		return 0, false, fmt.Errorf("error while recording update in LedgerWAL: %w", err)
	}
	if len(locations) != 1 {
		return 0, false, fmt.Errorf("error while recording update in LedgerWAL: got %d locations, expected 1 location", len(locations))
	}

	return locations[0].Segment, false, nil
}

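// RecordDelete writes a deletion record for the trie with the given root hash to
// the write-ahead log on disk. It is a no-op while recording is paused.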
func (w *DiskWAL) RecordDelete(rootHash ledger.RootHash) error {
	if w.paused {
		return nil
	}

	bytes := EncodeDelete(rootHash)

	_, err := w.wal.Log(bytes)

	if err != nil {
		return fmt.Errorf("error while recording delete in LedgerWAL: %w", err)
	}
	return nil
}

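// ReplayOnForest replays the whole WAL onto the given forest: tries loaded from a
// checkpoint are added via AddTries, recorded updates are re-applied via Update,
// and recorded deletions are ignored.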
func (w *DiskWAL) ReplayOnForest(forest *mtrie.Forest) error {
	return w.Replay(
		func(tries []*trie.MTrie) error {
			err := forest.AddTries(tries)
			if err != nil {
				return fmt.Errorf("adding rebuilt tries to forest failed: %w", err)
			}
			return nil
		},
		func(update *ledger.TrieUpdate) error {
			_, err := forest.Update(update)
			return err
		},
		func(rootHash ledger.RootHash) error {
			return nil
		},
	)
}

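// Segments returns the numbers of the first and last segment files currently
// present in the WAL directory.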
func (w *DiskWAL) Segments() (first, last int, err error) {
	return prometheusWAL.Segments(w.wal.Dir())
}

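// Replay loads the most recent usable checkpoint, if any, and then replays all
// remaining WAL segments, invoking checkpointFn for checkpointed tries, updateFn
// for every recorded update, and deleteFn for every recorded deletion.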
func (w *DiskWAL) Replay(
	checkpointFn func(tries []*trie.MTrie) error,
	updateFn func(update *ledger.TrieUpdate) error,
	deleteFn func(ledger.RootHash) error,
) error {
	from, to, err := w.Segments()
	if err != nil {
		return fmt.Errorf("could not find segments: %w", err)
	}
	err = w.replay(from, to, checkpointFn, updateFn, deleteFn, true)
	if err != nil {
		return fmt.Errorf("could not replay segments [%v:%v]: %w", from, to, err)
	}
	return nil
}

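// ReplayLogsOnly replays all WAL segments without consulting intermediate
// checkpoints; only the root checkpoint is loaded, and only when replay starts
// at segment 0.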
func (w *DiskWAL) ReplayLogsOnly(
	checkpointFn func(tries []*trie.MTrie) error,
	updateFn func(update *ledger.TrieUpdate) error,
	deleteFn func(rootHash ledger.RootHash) error,
) error {
	from, to, err := w.Segments()
	if err != nil {
		return fmt.Errorf("could not find segments: %w", err)
	}
	err = w.replay(from, to, checkpointFn, updateFn, deleteFn, false)
	if err != nil {
		return fmt.Errorf("could not replay WAL only for segments [%v:%v]: %w", from, to, err)
	}
	return nil
}

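// replay is the shared implementation behind Replay and ReplayLogsOnly. When
// useCheckpoints is true it tries to load the latest checkpoint within
// [from-1, to] and then replays only the segments following it; otherwise it
// replays segments [from, to] directly. In both modes the root checkpoint is
// consulted when no other checkpoint was loaded and replay starts at segment 0.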
func (w *DiskWAL) replay(
	from, to int,
	checkpointFn func(tries []*trie.MTrie) error,
	updateFn func(update *ledger.TrieUpdate) error,
	deleteFn func(rootHash ledger.RootHash) error,
	useCheckpoints bool,
) error {

	w.log.Info().Msgf("loading checkpoint with WAL from %d to %d", from, to)

	if to < from {
		return fmt.Errorf("end of range cannot be smaller than beginning")
	}

	loadedCheckpoint := -1
	startSegment := from
	checkpointLoaded := false

	checkpointer, err := w.NewCheckpointer()
	if err != nil {
		return fmt.Errorf("cannot create checkpointer: %w", err)
	}

	if useCheckpoints {
		allCheckpoints, err := checkpointer.Checkpoints()
		if err != nil {
			return fmt.Errorf("cannot get list of checkpoints: %w", err)
		}

		var availableCheckpoints []int

		// if there are no checkpoints at all, don't bother
		if len(allCheckpoints) > 0 {
			// from-1 accounts for a checkpoint covering the segments preceding the range,
			// e.g. checkpoint 8 when replaying segments 9-12
			availableCheckpoints = getPossibleCheckpoints(allCheckpoints, from-1, to)
		}

		for len(availableCheckpoints) > 0 {
			// as long as there are checkpoints to try, always start with the latest checkpoint file,
			// since it allows us to load fewer segments
			latestCheckpoint := availableCheckpoints[len(availableCheckpoints)-1]

			w.log.Info().Int("checkpoint", latestCheckpoint).Msg("loading checkpoint")

			forestSequencing, err := checkpointer.LoadCheckpoint(latestCheckpoint)
			if err != nil {
				w.log.Warn().Int("checkpoint", latestCheckpoint).Err(err).
					Msg("checkpoint loading failed")

				availableCheckpoints = availableCheckpoints[:len(availableCheckpoints)-1]
				continue
			}

			w.log.Info().Int("checkpoint", latestCheckpoint).Msg("checkpoint loaded")

			err = checkpointFn(forestSequencing)
			if err != nil {
				return fmt.Errorf("error while handling checkpoint: %w", err)
			}
			loadedCheckpoint = latestCheckpoint
			checkpointLoaded = true
			break
		}

		if loadedCheckpoint != -1 && loadedCheckpoint == to {
			return nil
		}

		if loadedCheckpoint >= 0 {
			startSegment = loadedCheckpoint + 1
		}
	}

	if loadedCheckpoint == -1 && startSegment == 0 {
		hasRootCheckpoint, err := checkpointer.HasRootCheckpoint()
		if err != nil {
			return fmt.Errorf("cannot check root checkpoint existence: %w", err)
		}
		if hasRootCheckpoint {
			w.log.Info().Msgf("loading root checkpoint")

			flattenedForest, err := checkpointer.LoadRootCheckpoint()
			if err != nil {
				return fmt.Errorf("cannot load root checkpoint: %w", err)
			}
			err = checkpointFn(flattenedForest)
			if err != nil {
				return fmt.Errorf("error while handling root checkpoint: %w", err)
			}

			w.log.Info().Msgf("root checkpoint loaded")
			checkpointLoaded = true
		}
	}

	w.log.Info().
		Bool("checkpoint_loaded", checkpointLoaded).
		Int("loaded_checkpoint", loadedCheckpoint).
		Msgf("replaying segments from %d to %d", startSegment, to)

	sr, err := prometheusWAL.NewSegmentsRangeReader(prometheusWAL.SegmentRange{
		Dir:   w.wal.Dir(),
		First: startSegment,
		Last:  to,
	})
	if err != nil {
		return fmt.Errorf("cannot create segment reader: %w", err)
	}

	reader := prometheusWAL.NewReader(sr)

	defer sr.Close()

	for reader.Next() {
		record := reader.Record()
		operation, rootHash, update, err := Decode(record)
		if err != nil {
			return fmt.Errorf("cannot decode LedgerWAL record: %w", err)
		}

		switch operation {
		case WALUpdate:
			err = updateFn(update)
			if err != nil {
				return fmt.Errorf("error while processing LedgerWAL update: %w", err)
			}
		case WALDelete:
			err = deleteFn(rootHash)
			if err != nil {
				return fmt.Errorf("error while processing LedgerWAL deletion: %w", err)
			}
		}

		err = reader.Err()
		if err != nil {
			return fmt.Errorf("cannot read LedgerWAL: %w", err)
		}
	}

	w.log.Info().Msgf("finished loading checkpoint and replaying WAL from %d to %d", from, to)

	return nil
}

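// getPossibleCheckpoints returns the checkpoints from the sorted allCheckpoints
// list that fall within the inclusive range [from, to].
//
// For example (illustrative values), getPossibleCheckpoints([]int{2, 5, 8}, 4, 9)
// returns []int{5, 8}.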
func getPossibleCheckpoints(allCheckpoints []int, from, to int) []int {
	// the list of checkpoints is sorted
	indexFrom := sort.SearchInts(allCheckpoints, from)
	indexTo := sort.SearchInts(allCheckpoints, to)

	// all checkpoints are earlier than `to`; return every checkpoint from `from` onwards
	if indexTo == len(allCheckpoints) {
		return allCheckpoints[indexFrom:indexTo]
	}

	// exact match on `to`, include it
	if allCheckpoints[indexTo] == to {
		return allCheckpoints[indexFrom : indexTo+1]
	}

	// no exact match; indexTo == 0 means even the earliest checkpoint is past `to`, so nothing matches
	if indexTo == 0 {
		return nil
	}

	return allCheckpoints[indexFrom:indexTo]
}

// NewCheckpointer returns a Checkpointer for this WAL
func (w *DiskWAL) NewCheckpointer() (*Checkpointer, error) {
	return NewCheckpointer(w, w.pathByteSize, w.forestCapacity), nil
}

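// Ready implements interface module.ReadyDoneAware.
// The returned channel is closed immediately, since the DiskWAL needs no start-up work.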
func (w *DiskWAL) Ready() <-chan struct{} {
	ready := make(chan struct{})
	close(ready)
	return ready
}

// Done implements interface module.ReadyDoneAware.
// It closes all open write-ahead log files.
func (w *DiskWAL) Done() <-chan struct{} {
	err := w.wal.Close()
	if err != nil {
		w.log.Err(err).Msg("error while closing WAL")
	}
	done := make(chan struct{})
	close(done)
	return done
}

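// LedgerWAL is the interface for the ledger's write-ahead log; DiskWAL is its
// on-disk implementation.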
type LedgerWAL interface {
	module.ReadyDoneAware

	NewCheckpointer() (*Checkpointer, error)
	PauseRecord()
	UnpauseRecord()
	RecordUpdate(update *ledger.TrieUpdate) (int, bool, error)
	RecordDelete(rootHash ledger.RootHash) error
	ReplayOnForest(forest *mtrie.Forest) error
	Segments() (first, last int, err error)
	Replay(
		checkpointFn func(tries []*trie.MTrie) error,
		updateFn func(update *ledger.TrieUpdate) error,
		deleteFn func(ledger.RootHash) error,
	) error
	ReplayLogsOnly(
		checkpointFn func(tries []*trie.MTrie) error,
		updateFn func(update *ledger.TrieUpdate) error,
		deleteFn func(rootHash ledger.RootHash) error,
	) error
}