github.com/ari-anchor/sei-tendermint@v0.0.0-20230519144642-dc826b7b56bb/internal/consensus/wal.go (about)

     1  package consensus
     2  
     3  import (
     4  	"context"
     5  	"encoding/binary"
     6  	"errors"
     7  	"fmt"
     8  	"hash/crc32"
     9  	"io"
    10  	"path/filepath"
    11  	"time"
    12  
    13  	"github.com/gogo/protobuf/proto"
    14  
    15  	"github.com/ari-anchor/sei-tendermint/internal/jsontypes"
    16  	auto "github.com/ari-anchor/sei-tendermint/internal/libs/autofile"
    17  	"github.com/ari-anchor/sei-tendermint/libs/log"
    18  	tmos "github.com/ari-anchor/sei-tendermint/libs/os"
    19  	"github.com/ari-anchor/sei-tendermint/libs/service"
    20  	tmtime "github.com/ari-anchor/sei-tendermint/libs/time"
    21  	tmcons "github.com/ari-anchor/sei-tendermint/proto/tendermint/consensus"
    22  )
    23  
    24  const (
    25  	// time.Time + max consensus msg size
    26  	maxMsgSizeBytes = maxMsgSize + 24
    27  
    28  	// how often the WAL should be sync'd during period sync'ing
    29  	walDefaultFlushInterval = 2 * time.Second
    30  )
    31  
    32  //--------------------------------------------------------
// Types and functions for saving consensus messages.
    34  
    35  // TimedWALMessage wraps WALMessage and adds Time for debugging purposes.
    36  type TimedWALMessage struct {
    37  	Time time.Time  `json:"time"`
    38  	Msg  WALMessage `json:"msg"`
    39  }
    40  
    41  // EndHeightMessage marks the end of the given height inside WAL.
    42  // @internal used by scripts/wal2json util.
    43  type EndHeightMessage struct {
    44  	Height int64 `json:"height,string"`
    45  }
    46  
    47  func (EndHeightMessage) TypeTag() string { return "tendermint/wal/EndHeightMessage" }
    48  
    49  type WALMessage interface{}
    50  
    51  func init() {
    52  	jsontypes.MustRegister(msgInfo{})
    53  	jsontypes.MustRegister(timeoutInfo{})
    54  	jsontypes.MustRegister(EndHeightMessage{})
    55  }
    56  
    57  //--------------------------------------------------------
    58  // Simple write-ahead logger
    59  
    60  // WAL is an interface for any write-ahead logger.
    61  type WAL interface {
    62  	Write(WALMessage) error
    63  	WriteSync(WALMessage) error
    64  	FlushAndSync() error
    65  
    66  	SearchForEndHeight(height int64, options *WALSearchOptions) (rd io.ReadCloser, found bool, err error)
    67  
    68  	// service methods
    69  	Start(context.Context) error
    70  	Stop()
    71  	Wait()
    72  }
    73  
    74  // Write ahead logger writes msgs to disk before they are processed.
    75  // Can be used for crash-recovery and deterministic replay.
    76  // TODO: currently the wal is overwritten during replay catchup, give it a mode
    77  // so it's either reading or appending - must read to end to start appending
    78  // again.
    79  type BaseWAL struct {
    80  	service.BaseService
    81  	logger log.Logger
    82  
    83  	group *auto.Group
    84  
    85  	enc *WALEncoder
    86  
    87  	flushTicker   *time.Ticker
    88  	flushInterval time.Duration
    89  }
    90  
    91  var _ WAL = &BaseWAL{}
    92  
    93  // NewWAL returns a new write-ahead logger based on `baseWAL`, which implements
    94  // WAL. It's flushed and synced to disk every 2s and once when stopped.
    95  func NewWAL(ctx context.Context, logger log.Logger, walFile string, groupOptions ...func(*auto.Group)) (*BaseWAL, error) {
    96  	err := tmos.EnsureDir(filepath.Dir(walFile), 0700)
    97  	if err != nil {
    98  		return nil, fmt.Errorf("failed to ensure WAL directory is in place: %w", err)
    99  	}
   100  
   101  	group, err := auto.OpenGroup(ctx, logger, walFile, groupOptions...)
   102  	if err != nil {
   103  		return nil, err
   104  	}
   105  	wal := &BaseWAL{
   106  		logger:        logger,
   107  		group:         group,
   108  		enc:           NewWALEncoder(group),
   109  		flushInterval: walDefaultFlushInterval,
   110  	}
   111  	wal.BaseService = *service.NewBaseService(logger, "baseWAL", wal)
   112  	return wal, nil
   113  }
   114  
   115  // SetFlushInterval allows us to override the periodic flush interval for the WAL.
   116  func (wal *BaseWAL) SetFlushInterval(i time.Duration) {
   117  	wal.flushInterval = i
   118  }
   119  
   120  func (wal *BaseWAL) Group() *auto.Group {
   121  	return wal.group
   122  }
   123  
   124  func (wal *BaseWAL) OnStart(ctx context.Context) error {
   125  	size, err := wal.group.Head.Size()
   126  	if err != nil {
   127  		return err
   128  	} else if size == 0 {
   129  		if err := wal.WriteSync(EndHeightMessage{0}); err != nil {
   130  			return err
   131  		}
   132  	}
   133  	err = wal.group.Start(ctx)
   134  	if err != nil {
   135  		return err
   136  	}
   137  	wal.flushTicker = time.NewTicker(wal.flushInterval)
   138  	go wal.processFlushTicks(ctx)
   139  	return nil
   140  }
   141  
   142  func (wal *BaseWAL) processFlushTicks(ctx context.Context) {
   143  	for {
   144  		select {
   145  		case <-wal.flushTicker.C:
   146  			if err := wal.FlushAndSync(); err != nil {
   147  				wal.logger.Error("Periodic WAL flush failed", "err", err)
   148  			}
   149  		case <-ctx.Done():
   150  			return
   151  		}
   152  	}
   153  }
   154  
   155  // FlushAndSync flushes and fsync's the underlying group's data to disk.
   156  // See auto#FlushAndSync
   157  func (wal *BaseWAL) FlushAndSync() error {
   158  	return wal.group.FlushAndSync()
   159  }
   160  
   161  // Stop the underlying autofile group.
   162  // Use Wait() to ensure it's finished shutting down
   163  // before cleaning up files.
   164  func (wal *BaseWAL) OnStop() {
   165  	wal.flushTicker.Stop()
   166  	if err := wal.FlushAndSync(); err != nil {
   167  		wal.logger.Error("error on flush data to disk", "error", err)
   168  	}
   169  	wal.group.Stop()
   170  	wal.group.Close()
   171  }
   172  
   173  // Wait for the underlying autofile group to finish shutting down
   174  // so it's safe to cleanup files.
   175  func (wal *BaseWAL) Wait() {
   176  	if wal.IsRunning() {
   177  		wal.BaseService.Wait()
   178  	}
   179  	if wal.group.IsRunning() {
   180  		wal.group.Wait()
   181  	}
   182  }
   183  
   184  // Write is called in newStep and for each receive on the
   185  // peerMsgQueue and the timeoutTicker.
   186  // NOTE: does not call fsync()
   187  func (wal *BaseWAL) Write(msg WALMessage) error {
   188  	if wal == nil {
   189  		return nil
   190  	}
   191  
   192  	if err := wal.enc.Encode(&TimedWALMessage{tmtime.Now(), msg}); err != nil {
   193  		wal.logger.Error("error writing msg to consensus wal. WARNING: recover may not be possible for the current height",
   194  			"err", err, "msg", msg)
   195  		return err
   196  	}
   197  
   198  	return nil
   199  }
   200  
   201  // WriteSync is called when we receive a msg from ourselves
   202  // so that we write to disk before sending signed messages.
   203  // NOTE: calls fsync()
   204  func (wal *BaseWAL) WriteSync(msg WALMessage) error {
   205  	if wal == nil {
   206  		return nil
   207  	}
   208  
   209  	if err := wal.Write(msg); err != nil {
   210  		return err
   211  	}
   212  
   213  	if err := wal.FlushAndSync(); err != nil {
   214  		wal.logger.Error(`WriteSync failed to flush consensus wal.
   215  		WARNING: may result in creating alternative proposals / votes for the current height iff the node restarted`,
   216  			"err", err)
   217  		return err
   218  	}
   219  
   220  	return nil
   221  }
   222  
   223  // WALSearchOptions are optional arguments to SearchForEndHeight.
   224  type WALSearchOptions struct {
   225  	// IgnoreDataCorruptionErrors set to true will result in skipping data corruption errors.
   226  	IgnoreDataCorruptionErrors bool
   227  }
   228  
   229  // SearchForEndHeight searches for the EndHeightMessage with the given height
   230  // and returns an auto.GroupReader, whenever it was found or not and an error.
   231  // Group reader will be nil if found equals false.
   232  //
   233  // CONTRACT: caller must close group reader.
   234  func (wal *BaseWAL) SearchForEndHeight(
   235  	height int64,
   236  	options *WALSearchOptions) (rd io.ReadCloser, found bool, err error) {
   237  	var (
   238  		msg *TimedWALMessage
   239  		gr  *auto.GroupReader
   240  	)
   241  	lastHeightFound := int64(-1)
   242  
   243  	// NOTE: starting from the last file in the group because we're usually
   244  	// searching for the last height. See replay.go
   245  	min, max := wal.group.MinIndex(), wal.group.MaxIndex()
   246  	wal.logger.Info("Searching for height", "height", height, "min", min, "max", max)
   247  	for index := max; index >= min; index-- {
   248  		gr, err = wal.group.NewReader(index)
   249  		if err != nil {
   250  			return nil, false, err
   251  		}
   252  
   253  		dec := NewWALDecoder(gr)
   254  		for {
   255  			msg, err = dec.Decode()
   256  			if err == io.EOF {
   257  				// OPTIMISATION: no need to look for height in older files if we've seen h < height
   258  				if lastHeightFound > 0 && lastHeightFound < height {
   259  					gr.Close()
   260  					return nil, false, nil
   261  				}
   262  				// check next file
   263  				break
   264  			}
   265  			if options.IgnoreDataCorruptionErrors && IsDataCorruptionError(err) {
   266  				wal.logger.Error("Corrupted entry. Skipping...", "err", err)
   267  				// do nothing
   268  				continue
   269  			} else if err != nil {
   270  				gr.Close()
   271  				return nil, false, err
   272  			}
   273  
   274  			if m, ok := msg.Msg.(EndHeightMessage); ok {
   275  				lastHeightFound = m.Height
   276  				if m.Height == height { // found
   277  					wal.logger.Info("Found", "height", height, "index", index)
   278  					return gr, true, nil
   279  				}
   280  			}
   281  		}
   282  		gr.Close()
   283  	}
   284  
   285  	return nil, false, nil
   286  }
   287  
   288  // A WALEncoder writes custom-encoded WAL messages to an output stream.
   289  //
   290  // Format: 4 bytes CRC sum + 4 bytes length + arbitrary-length value
   291  type WALEncoder struct {
   292  	wr io.Writer
   293  }
   294  
   295  // NewWALEncoder returns a new encoder that writes to wr.
   296  func NewWALEncoder(wr io.Writer) *WALEncoder {
   297  	return &WALEncoder{wr}
   298  }
   299  
   300  // Encode writes the custom encoding of v to the stream. It returns an error if
   301  // the encoded size of v is greater than 4MB. Any error encountered
   302  // during the write is also returned.
   303  func (enc *WALEncoder) Encode(v *TimedWALMessage) error {
   304  	pbMsg, err := WALToProto(v.Msg)
   305  	if err != nil {
   306  		return err
   307  	}
   308  	pv := tmcons.TimedWALMessage{
   309  		Time: v.Time,
   310  		Msg:  pbMsg,
   311  	}
   312  
   313  	data, err := proto.Marshal(&pv)
   314  	if err != nil {
   315  		panic(fmt.Errorf("encode timed wall message failure: %w", err))
   316  	}
   317  
   318  	crc := crc32.Checksum(data, crc32c)
   319  	length := uint32(len(data))
   320  	if length > maxMsgSizeBytes {
   321  		return fmt.Errorf("msg is too big: %d bytes, max: %d bytes", length, maxMsgSizeBytes)
   322  	}
   323  	totalLength := 8 + int(length)
   324  
   325  	msg := make([]byte, totalLength)
   326  	binary.BigEndian.PutUint32(msg[0:4], crc)
   327  	binary.BigEndian.PutUint32(msg[4:8], length)
   328  	copy(msg[8:], data)
   329  
   330  	_, err = enc.wr.Write(msg)
   331  	return err
   332  }
   333  
   334  // IsDataCorruptionError returns true if data has been corrupted inside WAL.
   335  func IsDataCorruptionError(err error) bool {
   336  	_, ok := err.(DataCorruptionError)
   337  	return ok
   338  }
   339  
   340  // DataCorruptionError is an error that occures if data on disk was corrupted.
   341  type DataCorruptionError struct {
   342  	cause error
   343  }
   344  
   345  func (e DataCorruptionError) Error() string {
   346  	return fmt.Sprintf("DataCorruptionError[%v]", e.cause)
   347  }
   348  
   349  func (e DataCorruptionError) Cause() error {
   350  	return e.cause
   351  }
   352  
   353  // A WALDecoder reads and decodes custom-encoded WAL messages from an input
   354  // stream. See WALEncoder for the format used.
   355  //
   356  // It will also compare the checksums and make sure data size is equal to the
   357  // length from the header. If that is not the case, error will be returned.
   358  type WALDecoder struct {
   359  	rd io.Reader
   360  }
   361  
   362  // NewWALDecoder returns a new decoder that reads from rd.
   363  func NewWALDecoder(rd io.Reader) *WALDecoder {
   364  	return &WALDecoder{rd}
   365  }
   366  
   367  // Decode reads the next custom-encoded value from its reader and returns it.
   368  func (dec *WALDecoder) Decode() (*TimedWALMessage, error) {
   369  	b := make([]byte, 4)
   370  
   371  	_, err := dec.rd.Read(b)
   372  	if errors.Is(err, io.EOF) {
   373  		return nil, err
   374  	}
   375  	if err != nil {
   376  		return nil, DataCorruptionError{fmt.Errorf("failed to read checksum: %w", err)}
   377  	}
   378  	crc := binary.BigEndian.Uint32(b)
   379  
   380  	b = make([]byte, 4)
   381  	_, err = dec.rd.Read(b)
   382  	if err != nil {
   383  		return nil, DataCorruptionError{fmt.Errorf("failed to read length: %w", err)}
   384  	}
   385  	length := binary.BigEndian.Uint32(b)
   386  
   387  	if length > maxMsgSizeBytes {
   388  		return nil, DataCorruptionError{fmt.Errorf(
   389  			"length %d exceeded maximum possible value of %d bytes",
   390  			length,
   391  			maxMsgSizeBytes)}
   392  	}
   393  
   394  	data := make([]byte, length)
   395  	n, err := dec.rd.Read(data)
   396  	if err != nil {
   397  		return nil, DataCorruptionError{fmt.Errorf("failed to read data: %v (read: %d, wanted: %d)", err, n, length)}
   398  	}
   399  
   400  	// check checksum before decoding data
   401  	actualCRC := crc32.Checksum(data, crc32c)
   402  	if actualCRC != crc {
   403  		return nil, DataCorruptionError{fmt.Errorf("checksums do not match: read: %v, actual: %v", crc, actualCRC)}
   404  	}
   405  
   406  	var res = new(tmcons.TimedWALMessage)
   407  	err = proto.Unmarshal(data, res)
   408  	if err != nil {
   409  		return nil, DataCorruptionError{fmt.Errorf("failed to decode data: %w", err)}
   410  	}
   411  
   412  	walMsg, err := WALFromProto(res.Msg)
   413  	if err != nil {
   414  		return nil, DataCorruptionError{fmt.Errorf("failed to convert from proto: %w", err)}
   415  	}
   416  	tMsgWal := &TimedWALMessage{
   417  		Time: res.Time,
   418  		Msg:  walMsg,
   419  	}
   420  
   421  	return tMsgWal, err
   422  }
   423  
   424  type nilWAL struct{}
   425  
   426  var _ WAL = nilWAL{}
   427  
   428  func (nilWAL) Write(m WALMessage) error     { return nil }
   429  func (nilWAL) WriteSync(m WALMessage) error { return nil }
   430  func (nilWAL) FlushAndSync() error          { return nil }
   431  func (nilWAL) SearchForEndHeight(height int64, options *WALSearchOptions) (rd io.ReadCloser, found bool, err error) {
   432  	return nil, false, nil
   433  }
   434  func (nilWAL) Start(context.Context) error { return nil }
   435  func (nilWAL) Stop()                       {}
   436  func (nilWAL) Wait()                       {}