github.com/Finschia/ostracon@v1.1.5/consensus/wal.go (about)

     1  package consensus
     2  
     3  import (
     4  	"encoding/binary"
     5  	"errors"
     6  	"fmt"
     7  	"hash/crc32"
     8  	"io"
     9  	"path/filepath"
    10  	"time"
    11  
    12  	"github.com/gogo/protobuf/proto"
    13  
    14  	tmcons "github.com/tendermint/tendermint/proto/tendermint/consensus"
    15  
    16  	auto "github.com/Finschia/ostracon/libs/autofile"
    17  	tmjson "github.com/Finschia/ostracon/libs/json"
    18  	"github.com/Finschia/ostracon/libs/log"
    19  	tmos "github.com/Finschia/ostracon/libs/os"
    20  	"github.com/Finschia/ostracon/libs/service"
    21  	tmtime "github.com/Finschia/ostracon/types/time"
    22  )
    23  
    24  const (
    25  	// time.Time + max consensus msg size
    26  	maxMsgSizeBytes = maxMsgSize + 24
    27  
    28  	// how often the WAL should be sync'd during period sync'ing
    29  	walDefaultFlushInterval = 2 * time.Second
    30  )
    31  
    32  //--------------------------------------------------------
// types and functions for saving consensus messages
    34  
    35  // TimedWALMessage wraps WALMessage and adds Time for debugging purposes.
    36  type TimedWALMessage struct {
    37  	Time time.Time  `json:"time"`
    38  	Msg  WALMessage `json:"msg"`
    39  }
    40  
    41  // EndHeightMessage marks the end of the given height inside WAL.
    42  // @internal used by scripts/wal2json util.
    43  type EndHeightMessage struct {
    44  	Height int64 `json:"height"`
    45  }
    46  
    47  type WALMessage interface{}
    48  
    49  func init() {
    50  	tmjson.RegisterType(msgInfo{}, "ostracon/wal/MsgInfo")
    51  	tmjson.RegisterType(timeoutInfo{}, "ostracon/wal/TimeoutInfo")
    52  	tmjson.RegisterType(EndHeightMessage{}, "ostracon/wal/EndHeightMessage")
    53  }
    54  
    55  //--------------------------------------------------------
    56  // Simple write-ahead logger
    57  
    58  // WAL is an interface for any write-ahead logger.
    59  type WAL interface {
    60  	Write(WALMessage) error
    61  	WriteSync(WALMessage) error
    62  	FlushAndSync() error
    63  
    64  	SearchForEndHeight(height int64, options *WALSearchOptions) (rd io.ReadCloser, found bool, err error)
    65  
    66  	// service methods
    67  	Start() error
    68  	Stop() error
    69  	Wait()
    70  }
    71  
    72  // Write ahead logger writes msgs to disk before they are processed.
    73  // Can be used for crash-recovery and deterministic replay.
    74  // TODO: currently the wal is overwritten during replay catchup, give it a mode
    75  // so it's either reading or appending - must read to end to start appending
    76  // again.
    77  type BaseWAL struct {
    78  	service.BaseService
    79  
    80  	group *auto.Group
    81  
    82  	enc *WALEncoder
    83  
    84  	flushTicker   *time.Ticker
    85  	flushInterval time.Duration
    86  }
    87  
    88  var _ WAL = &BaseWAL{}
    89  
    90  // NewWAL returns a new write-ahead logger based on `baseWAL`, which implements
    91  // WAL. It's flushed and synced to disk every 2s and once when stopped.
    92  func NewWAL(walFile string, groupOptions ...func(*auto.Group)) (*BaseWAL, error) {
    93  	err := tmos.EnsureDir(filepath.Dir(walFile), 0700)
    94  	if err != nil {
    95  		return nil, fmt.Errorf("failed to ensure WAL directory is in place: %w", err)
    96  	}
    97  
    98  	group, err := auto.OpenGroup(walFile, groupOptions...)
    99  	if err != nil {
   100  		return nil, err
   101  	}
   102  	wal := &BaseWAL{
   103  		group:         group,
   104  		enc:           NewWALEncoder(group),
   105  		flushInterval: walDefaultFlushInterval,
   106  	}
   107  	wal.BaseService = *service.NewBaseService(nil, "baseWAL", wal)
   108  	return wal, nil
   109  }
   110  
   111  // SetFlushInterval allows us to override the periodic flush interval for the WAL.
   112  func (wal *BaseWAL) SetFlushInterval(i time.Duration) {
   113  	wal.flushInterval = i
   114  }
   115  
   116  func (wal *BaseWAL) Group() *auto.Group {
   117  	return wal.group
   118  }
   119  
   120  func (wal *BaseWAL) SetLogger(l log.Logger) {
   121  	wal.BaseService.Logger = l
   122  	wal.group.SetLogger(l)
   123  }
   124  
   125  func (wal *BaseWAL) OnStart() error {
   126  	size, err := wal.group.Head.Size()
   127  	if err != nil {
   128  		return err
   129  	} else if size == 0 {
   130  		if err := wal.WriteSync(EndHeightMessage{0}); err != nil {
   131  			return err
   132  		}
   133  	}
   134  	err = wal.group.Start()
   135  	if err != nil {
   136  		return err
   137  	}
   138  	wal.flushTicker = time.NewTicker(wal.flushInterval)
   139  	go wal.processFlushTicks()
   140  	return nil
   141  }
   142  
   143  func (wal *BaseWAL) processFlushTicks() {
   144  	for {
   145  		select {
   146  		case <-wal.flushTicker.C:
   147  			if err := wal.FlushAndSync(); err != nil {
   148  				wal.Logger.Error("Periodic WAL flush failed", "err", err)
   149  			}
   150  		case <-wal.Quit():
   151  			return
   152  		}
   153  	}
   154  }
   155  
   156  // FlushAndSync flushes and fsync's the underlying group's data to disk.
   157  // See auto#FlushAndSync
   158  func (wal *BaseWAL) FlushAndSync() error {
   159  	return wal.group.FlushAndSync()
   160  }
   161  
   162  // Stop the underlying autofile group.
   163  // Use Wait() to ensure it's finished shutting down
   164  // before cleaning up files.
   165  func (wal *BaseWAL) OnStop() {
   166  	wal.flushTicker.Stop()
   167  	if err := wal.FlushAndSync(); err != nil {
   168  		wal.Logger.Error("error on flush data to disk", "error", err)
   169  	}
   170  	if err := wal.group.Stop(); err != nil {
   171  		wal.Logger.Error("error trying to stop wal", "error", err)
   172  	}
   173  	wal.group.Close()
   174  }
   175  
   176  // Wait for the underlying autofile group to finish shutting down
   177  // so it's safe to cleanup files.
   178  func (wal *BaseWAL) Wait() {
   179  	wal.group.Wait()
   180  }
   181  
   182  // Write is called in newStep and for each receive on the
   183  // peerMsgQueue and the timeoutTicker.
   184  // NOTE: does not call fsync()
   185  func (wal *BaseWAL) Write(msg WALMessage) error {
   186  	if wal == nil {
   187  		return nil
   188  	}
   189  
   190  	if err := wal.enc.Encode(&TimedWALMessage{tmtime.Now(), msg}); err != nil {
   191  		wal.Logger.Error("Error writing msg to consensus wal. WARNING: recover may not be possible for the current height",
   192  			"err", err, "msg", msg)
   193  		return err
   194  	}
   195  
   196  	return nil
   197  }
   198  
   199  // WriteSync is called when we receive a msg from ourselves
   200  // so that we write to disk before sending signed messages.
   201  // NOTE: calls fsync()
   202  func (wal *BaseWAL) WriteSync(msg WALMessage) error {
   203  	if wal == nil {
   204  		return nil
   205  	}
   206  
   207  	if err := wal.Write(msg); err != nil {
   208  		return err
   209  	}
   210  
   211  	if err := wal.FlushAndSync(); err != nil {
   212  		wal.Logger.Error(`WriteSync failed to flush consensus wal.
   213  		WARNING: may result in creating alternative proposals / votes for the current height iff the node restarted`,
   214  			"err", err)
   215  		return err
   216  	}
   217  
   218  	return nil
   219  }
   220  
   221  // WALSearchOptions are optional arguments to SearchForEndHeight.
   222  type WALSearchOptions struct {
   223  	// IgnoreDataCorruptionErrors set to true will result in skipping data corruption errors.
   224  	IgnoreDataCorruptionErrors bool
   225  }
   226  
   227  // SearchForEndHeight searches for the EndHeightMessage with the given height
   228  // and returns an auto.GroupReader, whenever it was found or not and an error.
   229  // Group reader will be nil if found equals false.
   230  //
   231  // CONTRACT: caller must close group reader.
   232  func (wal *BaseWAL) SearchForEndHeight(
   233  	height int64,
   234  	options *WALSearchOptions) (rd io.ReadCloser, found bool, err error) {
   235  	var (
   236  		msg *TimedWALMessage
   237  		gr  *auto.GroupReader
   238  	)
   239  	lastHeightFound := int64(-1)
   240  
   241  	// NOTE: starting from the last file in the group because we're usually
   242  	// searching for the last height. See replay.go
   243  	min, max := wal.group.MinIndex(), wal.group.MaxIndex()
   244  	wal.Logger.Info("Searching for height", "height", height, "min", min, "max", max)
   245  	for index := max; index >= min; index-- {
   246  		gr, err = wal.group.NewReader(index)
   247  		if err != nil {
   248  			return nil, false, err
   249  		}
   250  
   251  		dec := NewWALDecoder(gr)
   252  		for {
   253  			msg, err = dec.Decode()
   254  			if err == io.EOF {
   255  				// OPTIMISATION: no need to look for height in older files if we've seen h < height
   256  				if lastHeightFound > 0 && lastHeightFound < height {
   257  					gr.Close()
   258  					return nil, false, nil
   259  				}
   260  				// check next file
   261  				break
   262  			}
   263  			if options.IgnoreDataCorruptionErrors && IsDataCorruptionError(err) {
   264  				wal.Logger.Error("Corrupted entry. Skipping...", "err", err)
   265  				// do nothing
   266  				continue
   267  			} else if err != nil {
   268  				gr.Close()
   269  				return nil, false, err
   270  			}
   271  
   272  			if m, ok := msg.Msg.(EndHeightMessage); ok {
   273  				lastHeightFound = m.Height
   274  				if m.Height == height { // found
   275  					wal.Logger.Info("Found", "height", height, "index", index)
   276  					return gr, true, nil
   277  				}
   278  			}
   279  		}
   280  		gr.Close()
   281  	}
   282  
   283  	return nil, false, nil
   284  }
   285  
   286  // A WALEncoder writes custom-encoded WAL messages to an output stream.
   287  //
   288  // Format: 4 bytes CRC sum + 4 bytes length + arbitrary-length value
   289  type WALEncoder struct {
   290  	wr io.Writer
   291  }
   292  
   293  // NewWALEncoder returns a new encoder that writes to wr.
   294  func NewWALEncoder(wr io.Writer) *WALEncoder {
   295  	return &WALEncoder{wr}
   296  }
   297  
   298  // Encode writes the custom encoding of v to the stream. It returns an error if
   299  // the encoded size of v is greater than 1MB. Any error encountered
   300  // during the write is also returned.
   301  func (enc *WALEncoder) Encode(v *TimedWALMessage) error {
   302  	pbMsg, err := WALToProto(v.Msg)
   303  	if err != nil {
   304  		return err
   305  	}
   306  	pv := tmcons.TimedWALMessage{
   307  		Time: v.Time,
   308  		Msg:  pbMsg,
   309  	}
   310  
   311  	data, err := proto.Marshal(&pv)
   312  	if err != nil {
   313  		panic(fmt.Errorf("encode timed wall message failure: %w", err))
   314  	}
   315  
   316  	crc := crc32.Checksum(data, crc32c)
   317  	length := uint32(len(data))
   318  	if length > maxMsgSizeBytes {
   319  		return fmt.Errorf("msg is too big: %d bytes, max: %d bytes", length, maxMsgSizeBytes)
   320  	}
   321  	totalLength := 8 + int(length)
   322  
   323  	msg := make([]byte, totalLength)
   324  	binary.BigEndian.PutUint32(msg[0:4], crc)
   325  	binary.BigEndian.PutUint32(msg[4:8], length)
   326  	copy(msg[8:], data)
   327  
   328  	_, err = enc.wr.Write(msg)
   329  	return err
   330  }
   331  
   332  // IsDataCorruptionError returns true if data has been corrupted inside WAL.
   333  func IsDataCorruptionError(err error) bool {
   334  	_, ok := err.(DataCorruptionError)
   335  	return ok
   336  }
   337  
   338  // DataCorruptionError is an error that occures if data on disk was corrupted.
   339  type DataCorruptionError struct {
   340  	cause error
   341  }
   342  
   343  func (e DataCorruptionError) Error() string {
   344  	return fmt.Sprintf("DataCorruptionError[%v]", e.cause)
   345  }
   346  
   347  func (e DataCorruptionError) Cause() error {
   348  	return e.cause
   349  }
   350  
   351  // A WALDecoder reads and decodes custom-encoded WAL messages from an input
   352  // stream. See WALEncoder for the format used.
   353  //
   354  // It will also compare the checksums and make sure data size is equal to the
   355  // length from the header. If that is not the case, error will be returned.
   356  type WALDecoder struct {
   357  	rd io.Reader
   358  }
   359  
   360  // NewWALDecoder returns a new decoder that reads from rd.
   361  func NewWALDecoder(rd io.Reader) *WALDecoder {
   362  	return &WALDecoder{rd}
   363  }
   364  
   365  // Decode reads the next custom-encoded value from its reader and returns it.
   366  func (dec *WALDecoder) Decode() (*TimedWALMessage, error) {
   367  	b := make([]byte, 4)
   368  
   369  	_, err := dec.rd.Read(b)
   370  	if errors.Is(err, io.EOF) {
   371  		return nil, err
   372  	}
   373  	if err != nil {
   374  		return nil, DataCorruptionError{fmt.Errorf("failed to read checksum: %v", err)}
   375  	}
   376  	crc := binary.BigEndian.Uint32(b)
   377  
   378  	b = make([]byte, 4)
   379  	_, err = dec.rd.Read(b)
   380  	if err != nil {
   381  		return nil, DataCorruptionError{fmt.Errorf("failed to read length: %v", err)}
   382  	}
   383  	length := binary.BigEndian.Uint32(b)
   384  
   385  	if length > maxMsgSizeBytes {
   386  		return nil, DataCorruptionError{fmt.Errorf(
   387  			"length %d exceeded maximum possible value of %d bytes",
   388  			length,
   389  			maxMsgSizeBytes)}
   390  	}
   391  
   392  	data := make([]byte, length)
   393  	n, err := dec.rd.Read(data)
   394  	if err != nil {
   395  		return nil, DataCorruptionError{fmt.Errorf("failed to read data: %v (read: %d, wanted: %d)", err, n, length)}
   396  	}
   397  
   398  	// check checksum before decoding data
   399  	actualCRC := crc32.Checksum(data, crc32c)
   400  	if actualCRC != crc {
   401  		return nil, DataCorruptionError{fmt.Errorf("checksums do not match: read: %v, actual: %v", crc, actualCRC)}
   402  	}
   403  
   404  	var res = new(tmcons.TimedWALMessage)
   405  	err = proto.Unmarshal(data, res)
   406  	if err != nil {
   407  		return nil, DataCorruptionError{fmt.Errorf("failed to decode data: %v", err)}
   408  	}
   409  
   410  	walMsg, err := WALFromProto(res.Msg)
   411  	if err != nil {
   412  		return nil, DataCorruptionError{fmt.Errorf("failed to convert from proto: %w", err)}
   413  	}
   414  	tMsgWal := &TimedWALMessage{
   415  		Time: res.Time,
   416  		Msg:  walMsg,
   417  	}
   418  
   419  	return tMsgWal, err
   420  }
   421  
   422  type nilWAL struct{}
   423  
   424  var _ WAL = nilWAL{}
   425  
   426  func (nilWAL) Write(m WALMessage) error     { return nil }
   427  func (nilWAL) WriteSync(m WALMessage) error { return nil }
   428  func (nilWAL) FlushAndSync() error          { return nil }
   429  func (nilWAL) SearchForEndHeight(height int64, options *WALSearchOptions) (rd io.ReadCloser, found bool, err error) {
   430  	return nil, false, nil
   431  }
   432  func (nilWAL) Start() error { return nil }
   433  func (nilWAL) Stop() error  { return nil }
   434  func (nilWAL) Wait()        {}