github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/pkg/ingester/checkpoint.go (about)

     1  package ingester
     2  
     3  import (
     4  	"bytes"
     5  	"context"
     6  	fmt "fmt"
     7  	"io/ioutil"
     8  	"os"
     9  	"path/filepath"
    10  	"regexp"
    11  	"strconv"
    12  	"time"
    13  
    14  	"github.com/dustin/go-humanize"
    15  	"github.com/go-kit/log"
    16  	"github.com/go-kit/log/level"
    17  	"github.com/gogo/protobuf/proto"
    18  	"github.com/pkg/errors"
    19  	tsdb_errors "github.com/prometheus/prometheus/tsdb/errors"
    20  	"github.com/prometheus/prometheus/tsdb/fileutil"
    21  	"github.com/prometheus/prometheus/tsdb/wal"
    22  	prompool "github.com/prometheus/prometheus/util/pool"
    23  
    24  	"github.com/grafana/loki/pkg/chunkenc"
    25  	"github.com/grafana/loki/pkg/logproto"
    26  	util_log "github.com/grafana/loki/pkg/util/log"
    27  	"github.com/grafana/loki/pkg/util/pool"
    28  )
    29  
    30  var (
    31  	// todo(ctovena) those pools should be in factor of the actual configuration (blocksize, targetsize).
    32  	// Starting with something sane first then we can refine with more experience.
    33  
    34  	// Buckets [1KB 2KB 4KB 16KB 32KB  to 4MB] by 2
    35  	chunksBufferPool = pool.NewBuffer(1024, 4*1024*1024, 2)
    36  	// Buckets [64B 128B 256B 512B... to 2MB] by 2
    37  	headBufferPool = pool.NewBuffer(64, 2*1024*1024, 2)
    38  )
    39  
    40  type chunkWithBuffer struct {
    41  	blocks, head *bytes.Buffer
    42  	Chunk
    43  }
    44  
    45  // The passed wireChunks slice is for re-use.
    46  func toWireChunks(descs []chunkDesc, wireChunks []chunkWithBuffer) ([]chunkWithBuffer, error) {
    47  	// release memory from previous list of chunks.
    48  	for _, wc := range wireChunks {
    49  		chunksBufferPool.Put(wc.blocks)
    50  		headBufferPool.Put(wc.head)
    51  		wc.Data = nil
    52  		wc.Head = nil
    53  	}
    54  
    55  	if cap(wireChunks) < len(descs) {
    56  		wireChunks = make([]chunkWithBuffer, len(descs))
    57  	} else {
    58  		wireChunks = wireChunks[:len(descs)]
    59  	}
    60  
    61  	for i, d := range descs {
    62  		from, to := d.chunk.Bounds()
    63  		chunkSize, headSize := d.chunk.CheckpointSize()
    64  
    65  		wireChunk := chunkWithBuffer{
    66  			Chunk: Chunk{
    67  				From:        from,
    68  				To:          to,
    69  				Closed:      d.closed,
    70  				FlushedAt:   d.flushed,
    71  				LastUpdated: d.lastUpdated,
    72  				Synced:      d.synced,
    73  			},
    74  			blocks: chunksBufferPool.Get(chunkSize),
    75  			head:   headBufferPool.Get(headSize),
    76  		}
    77  
    78  		err := d.chunk.SerializeForCheckpointTo(
    79  			wireChunk.blocks,
    80  			wireChunk.head,
    81  		)
    82  		if err != nil {
    83  			return nil, err
    84  		}
    85  
    86  		wireChunk.Data = wireChunk.blocks.Bytes()
    87  		wireChunk.Head = wireChunk.head.Bytes()
    88  		wireChunks[i] = wireChunk
    89  	}
    90  	return wireChunks, nil
    91  }
    92  
    93  func fromWireChunks(conf *Config, wireChunks []Chunk) ([]chunkDesc, error) {
    94  	descs := make([]chunkDesc, 0, len(wireChunks))
    95  	for _, c := range wireChunks {
    96  		desc := chunkDesc{
    97  			closed:      c.Closed,
    98  			synced:      c.Synced,
    99  			flushed:     c.FlushedAt,
   100  			lastUpdated: c.LastUpdated,
   101  		}
   102  
   103  		// Always use Unordered headblocks during replay
   104  		// to ensure Loki can effectively replay an unordered-friendly
   105  		// WAL into a new configuration that disables unordered writes.
   106  		hbType := chunkenc.UnorderedHeadBlockFmt
   107  		mc, err := chunkenc.MemchunkFromCheckpoint(c.Data, c.Head, hbType, conf.BlockSize, conf.TargetChunkSize)
   108  		if err != nil {
   109  			return nil, err
   110  		}
   111  		desc.chunk = mc
   112  
   113  		descs = append(descs, desc)
   114  	}
   115  	return descs, nil
   116  }
   117  
   118  // nolint:interfacer
   119  func decodeCheckpointRecord(rec []byte, s *Series) error {
   120  	// TODO(owen-d): reduce allocs
   121  	// The proto unmarshaling code will retain references to the underlying []byte it's passed
   122  	// in order to reduce allocs. This is harmful to us because when reading from a WAL, the []byte
   123  	// is only guaranteed to be valid between calls to Next().
   124  	// Therefore, we copy it to avoid this problem.
   125  	cpy := make([]byte, len(rec))
   126  	copy(cpy, rec)
   127  
   128  	switch RecordType(cpy[0]) {
   129  	case CheckpointRecord:
   130  		return proto.Unmarshal(cpy[1:], s)
   131  	default:
   132  		return errors.Errorf("unexpected record type: %d", rec[0])
   133  	}
   134  }
   135  
   136  func encodeWithTypeHeader(m *Series, typ RecordType, buf []byte) ([]byte, error) {
   137  	size := m.Size()
   138  	if cap(buf) < size+1 {
   139  		buf = make([]byte, size+1)
   140  	}
   141  	_, err := m.MarshalTo(buf[1 : size+1])
   142  	if err != nil {
   143  		return nil, err
   144  	}
   145  	buf[0] = byte(typ)
   146  	return buf[:size+1], nil
   147  }
   148  
   149  type SeriesWithErr struct {
   150  	Err    error
   151  	Series *Series
   152  }
   153  
   154  type SeriesIter interface {
   155  	Count() int
   156  	Iter() *streamIterator
   157  	Stop()
   158  }
   159  
   160  type ingesterSeriesIter struct {
   161  	ing ingesterInstances
   162  
   163  	done chan struct{}
   164  }
   165  
   166  type ingesterInstances interface {
   167  	getInstances() []*instance
   168  }
   169  
   170  func newIngesterSeriesIter(ing ingesterInstances) *ingesterSeriesIter {
   171  	return &ingesterSeriesIter{
   172  		ing:  ing,
   173  		done: make(chan struct{}),
   174  	}
   175  }
   176  
   177  func (i *ingesterSeriesIter) Count() (ct int) {
   178  	for _, inst := range i.ing.getInstances() {
   179  		ct += inst.numStreams()
   180  	}
   181  	return ct
   182  }
   183  
   184  func (i *ingesterSeriesIter) Stop() {
   185  	close(i.done)
   186  }
   187  
   188  func (i *ingesterSeriesIter) Iter() *streamIterator {
   189  	return newStreamsIterator(i.ing)
   190  }
   191  
   192  type streamInstance struct {
   193  	id      string
   194  	streams []*stream
   195  }
   196  
   197  type streamIterator struct {
   198  	instances []streamInstance
   199  
   200  	current Series
   201  	buffer  []chunkWithBuffer
   202  	err     error
   203  }
   204  
   205  // newStreamsIterator returns a new stream iterators that iterates over one instance at a time, then
   206  // each stream per instances.
   207  func newStreamsIterator(ing ingesterInstances) *streamIterator {
   208  	instances := ing.getInstances()
   209  	streamInstances := make([]streamInstance, len(instances))
   210  	for i, inst := range instances {
   211  		streams := make([]*stream, 0, inst.streams.Len())
   212  		_ = inst.forAllStreams(context.Background(), func(s *stream) error {
   213  			streams = append(streams, s)
   214  			return nil
   215  		})
   216  		streamInstances[i] = streamInstance{
   217  			streams: streams,
   218  			id:      inst.instanceID,
   219  		}
   220  	}
   221  	return &streamIterator{
   222  		instances: streamInstances,
   223  	}
   224  }
   225  
   226  // Next loads the next stream of the current instance.
   227  // If the instance is empty, it moves to the next instance until there is no more.
   228  // Return true if there's a next stream, each successful calls will replace the current stream.
   229  func (s *streamIterator) Next() bool {
   230  	if len(s.instances) == 0 {
   231  		s.instances = nil
   232  		return false
   233  	}
   234  	currentInstance := s.instances[0]
   235  	if len(currentInstance.streams) == 0 {
   236  		s.instances = s.instances[1:]
   237  		return s.Next()
   238  	}
   239  
   240  	// current stream
   241  	stream := currentInstance.streams[0]
   242  
   243  	// remove the first stream
   244  	s.instances[0].streams = s.instances[0].streams[1:]
   245  
   246  	stream.chunkMtx.RLock()
   247  	defer stream.chunkMtx.RUnlock()
   248  
   249  	if len(stream.chunks) < 1 {
   250  		// it's possible the stream has been flushed to storage
   251  		// in between starting the checkpointing process and
   252  		// checkpointing this stream.
   253  		return s.Next()
   254  	}
   255  	chunks, err := toWireChunks(stream.chunks, s.buffer)
   256  	if err != nil {
   257  		s.err = err
   258  		return false
   259  	}
   260  	s.buffer = chunks
   261  
   262  	s.current.Chunks = s.current.Chunks[:0]
   263  	if cap(s.current.Chunks) == 0 {
   264  		s.current.Chunks = make([]Chunk, 0, len(chunks))
   265  	}
   266  
   267  	for _, c := range chunks {
   268  		s.current.Chunks = append(s.current.Chunks, c.Chunk)
   269  	}
   270  
   271  	s.current.UserID = currentInstance.id
   272  	s.current.Fingerprint = uint64(stream.fp)
   273  	s.current.Labels = logproto.FromLabelsToLabelAdapters(stream.labels)
   274  
   275  	s.current.To = stream.lastLine.ts
   276  	s.current.LastLine = stream.lastLine.content
   277  	s.current.EntryCt = stream.entryCt
   278  	s.current.HighestTs = stream.highestTs
   279  
   280  	return true
   281  }
   282  
   283  // Err returns an errors thrown while iterating over the streams.
   284  func (s *streamIterator) Error() error {
   285  	return s.err
   286  }
   287  
   288  // Stream is serializable (for checkpointing) stream of chunks.
   289  // NOTE: the series is re-used between successful Next calls.
   290  // This means you should make a copy or use the data before calling Next.
   291  func (s *streamIterator) Stream() *Series {
   292  	return &s.current
   293  }
   294  
   295  type CheckpointWriter interface {
   296  	// Advances current checkpoint, can also signal a no-op.
   297  	Advance() (noop bool, err error)
   298  	Write(*Series) error
   299  	// Closes current checkpoint.
   300  	Close(abort bool) error
   301  }
   302  
   303  type walLogger interface {
   304  	Log(recs ...[]byte) error
   305  	Close() error
   306  	Dir() string
   307  }
   308  
   309  type WALCheckpointWriter struct {
   310  	metrics    *ingesterMetrics
   311  	segmentWAL *wal.WAL
   312  
   313  	checkpointWAL walLogger
   314  	lastSegment   int    // name of the last segment guaranteed to be covered by the checkpoint
   315  	final         string // filename to atomically rotate upon completion
   316  	bufSize       int
   317  	recs          [][]byte
   318  }
   319  
   320  func (w *WALCheckpointWriter) Advance() (bool, error) {
   321  	_, lastSegment, err := wal.Segments(w.segmentWAL.Dir())
   322  	if err != nil {
   323  		return false, err
   324  	}
   325  
   326  	if lastSegment < 0 {
   327  		// There are no WAL segments. No need of checkpoint yet.
   328  		return true, nil
   329  	}
   330  
   331  	// First we advance the wal segment internally to ensure we don't overlap a previous checkpoint in
   332  	// low throughput scenarios and to minimize segment replays on top of checkpoints.
   333  	if err := w.segmentWAL.NextSegment(); err != nil {
   334  		return false, err
   335  	}
   336  
   337  	// Checkpoint is named after the last WAL segment present so that when replaying the WAL
   338  	// we can start from that particular WAL segment.
   339  	checkpointDir := filepath.Join(w.segmentWAL.Dir(), fmt.Sprintf(checkpointPrefix+"%06d", lastSegment))
   340  	level.Info(util_log.Logger).Log("msg", "attempting checkpoint for", "dir", checkpointDir)
   341  	checkpointDirTemp := checkpointDir + ".tmp"
   342  
   343  	// cleanup any old partial checkpoints
   344  	if _, err := os.Stat(checkpointDirTemp); err == nil {
   345  		if err := os.RemoveAll(checkpointDirTemp); err != nil {
   346  			level.Error(util_log.Logger).Log("msg", "unable to cleanup old tmp checkpoint", "dir", checkpointDirTemp)
   347  			return false, err
   348  		}
   349  	}
   350  
   351  	if err := os.MkdirAll(checkpointDirTemp, 0777); err != nil {
   352  		return false, errors.Wrap(err, "create checkpoint dir")
   353  	}
   354  
   355  	checkpoint, err := wal.NewSize(log.With(util_log.Logger, "component", "checkpoint_wal"), nil, checkpointDirTemp, walSegmentSize, false)
   356  	if err != nil {
   357  		return false, errors.Wrap(err, "open checkpoint")
   358  	}
   359  
   360  	w.checkpointWAL = checkpoint
   361  	w.lastSegment = lastSegment
   362  	w.final = checkpointDir
   363  
   364  	return false, nil
   365  }
   366  
   367  // Buckets [64KB to 256MB] by 2
   368  var recordBufferPool = prompool.New(1<<16, 1<<28, 2, func(size int) interface{} { return make([]byte, 0, size) })
   369  
   370  func (w *WALCheckpointWriter) Write(s *Series) error {
   371  	size := s.Size() + 1 // +1 for header
   372  	buf := recordBufferPool.Get(size).([]byte)[:size]
   373  
   374  	b, err := encodeWithTypeHeader(s, CheckpointRecord, buf)
   375  	if err != nil {
   376  		return err
   377  	}
   378  
   379  	w.recs = append(w.recs, b)
   380  	w.bufSize += len(b)
   381  	level.Debug(util_log.Logger).Log("msg", "writing series", "size", humanize.Bytes(uint64(len(b))))
   382  
   383  	// 1MB
   384  	if w.bufSize > 1<<20 {
   385  		if err := w.flush(); err != nil {
   386  			return err
   387  		}
   388  	}
   389  	return nil
   390  }
   391  
   392  func (w *WALCheckpointWriter) flush() error {
   393  	level.Debug(util_log.Logger).Log("msg", "flushing series", "totalSize", humanize.Bytes(uint64(w.bufSize)), "series", len(w.recs))
   394  	if err := w.checkpointWAL.Log(w.recs...); err != nil {
   395  		return err
   396  	}
   397  	w.metrics.checkpointLoggedBytesTotal.Add(float64(w.bufSize))
   398  	for _, b := range w.recs {
   399  		recordBufferPool.Put(b)
   400  	}
   401  	w.recs = w.recs[:0]
   402  	w.bufSize = 0
   403  	return nil
   404  }
   405  
   406  const checkpointPrefix = "checkpoint."
   407  
   408  var checkpointRe = regexp.MustCompile("^" + regexp.QuoteMeta(checkpointPrefix) + "(\\d+)(\\.tmp)?$")
   409  
   410  // checkpointIndex returns the index of a given checkpoint file. It handles
   411  // both regular and temporary checkpoints according to the includeTmp flag. If
   412  // the file is not a checkpoint it returns an error.
   413  func checkpointIndex(filename string, includeTmp bool) (int, error) {
   414  	result := checkpointRe.FindStringSubmatch(filename)
   415  	if len(result) < 2 {
   416  		return 0, errors.New("file is not a checkpoint")
   417  	}
   418  	// Filter out temporary checkpoints if desired.
   419  	if !includeTmp && len(result) == 3 && result[2] != "" {
   420  		return 0, errors.New("temporary checkpoint")
   421  	}
   422  	return strconv.Atoi(result[1])
   423  }
   424  
   425  // lastCheckpoint returns the directory name and index of the most recent checkpoint.
   426  // If dir does not contain any checkpoints, -1 is returned as index.
   427  func lastCheckpoint(dir string) (string, int, error) {
   428  	dirs, err := ioutil.ReadDir(dir)
   429  	if err != nil {
   430  		return "", -1, err
   431  	}
   432  	var (
   433  		maxIdx        = -1
   434  		checkpointDir string
   435  	)
   436  	// There may be multiple checkpoints left, so select the one with max index.
   437  	for i := 0; i < len(dirs); i++ {
   438  		di := dirs[i]
   439  
   440  		idx, err := checkpointIndex(di.Name(), false)
   441  		if err != nil {
   442  			continue
   443  		}
   444  		if !di.IsDir() {
   445  			return "", -1, fmt.Errorf("checkpoint %s is not a directory", di.Name())
   446  		}
   447  		if idx > maxIdx {
   448  			checkpointDir = di.Name()
   449  			maxIdx = idx
   450  		}
   451  	}
   452  	if maxIdx >= 0 {
   453  		return filepath.Join(dir, checkpointDir), maxIdx, nil
   454  	}
   455  	return "", -1, nil
   456  }
   457  
   458  // deleteCheckpoints deletes all checkpoints in a directory which is < maxIndex.
   459  func (w *WALCheckpointWriter) deleteCheckpoints(maxIndex int) (err error) {
   460  	w.metrics.checkpointDeleteTotal.Inc()
   461  	defer func() {
   462  		if err != nil {
   463  			w.metrics.checkpointDeleteFail.Inc()
   464  		}
   465  	}()
   466  
   467  	errs := tsdb_errors.NewMulti()
   468  
   469  	files, err := ioutil.ReadDir(w.segmentWAL.Dir())
   470  	if err != nil {
   471  		return err
   472  	}
   473  	for _, fi := range files {
   474  		index, err := checkpointIndex(fi.Name(), true)
   475  		if err != nil || index >= maxIndex {
   476  			continue
   477  		}
   478  		if err := os.RemoveAll(filepath.Join(w.segmentWAL.Dir(), fi.Name())); err != nil {
   479  			errs.Add(err)
   480  		}
   481  	}
   482  	return errs.Err()
   483  }
   484  
   485  func (w *WALCheckpointWriter) Close(abort bool) error {
   486  	if len(w.recs) > 0 {
   487  		if err := w.flush(); err != nil {
   488  			return err
   489  		}
   490  	}
   491  	if err := w.checkpointWAL.Close(); err != nil {
   492  		return err
   493  	}
   494  
   495  	if abort {
   496  		return os.RemoveAll(w.checkpointWAL.Dir())
   497  	}
   498  
   499  	if err := fileutil.Replace(w.checkpointWAL.Dir(), w.final); err != nil {
   500  		return errors.Wrap(err, "rename checkpoint directory")
   501  	}
   502  	level.Info(util_log.Logger).Log("msg", "atomic checkpoint finished", "old", w.checkpointWAL.Dir(), "new", w.final)
   503  	// We delete the WAL segments which are before the previous checkpoint and not before the
   504  	// current checkpoint created. This is because if the latest checkpoint is corrupted for any reason, we
   505  	// should be able to recover from the older checkpoint which would need the older WAL segments.
   506  	if err := w.segmentWAL.Truncate(w.lastSegment + 1); err != nil {
   507  		// It is fine to have old WAL segments hanging around if deletion failed.
   508  		// We can try again next time.
   509  		level.Error(util_log.Logger).Log("msg", "error deleting old WAL segments", "err", err, "lastSegment", w.lastSegment)
   510  	}
   511  
   512  	if w.lastSegment >= 0 {
   513  		if err := w.deleteCheckpoints(w.lastSegment); err != nil {
   514  			// It is fine to have old checkpoints hanging around if deletion failed.
   515  			// We can try again next time.
   516  			level.Error(util_log.Logger).Log("msg", "error deleting old checkpoint", "err", err)
   517  		}
   518  	}
   519  
   520  	return nil
   521  }
   522  
   523  type Checkpointer struct {
   524  	dur     time.Duration
   525  	iter    SeriesIter
   526  	writer  CheckpointWriter
   527  	metrics *ingesterMetrics
   528  
   529  	quit <-chan struct{}
   530  }
   531  
   532  func NewCheckpointer(dur time.Duration, iter SeriesIter, writer CheckpointWriter, metrics *ingesterMetrics, quit <-chan struct{}) *Checkpointer {
   533  	return &Checkpointer{
   534  		dur:     dur,
   535  		iter:    iter,
   536  		writer:  writer,
   537  		metrics: metrics,
   538  		quit:    quit,
   539  	}
   540  }
   541  
   542  func (c *Checkpointer) PerformCheckpoint() (err error) {
   543  	noop, err := c.writer.Advance()
   544  	if err != nil {
   545  		return err
   546  	}
   547  	if noop {
   548  		return nil
   549  	}
   550  
   551  	c.metrics.checkpointCreationTotal.Inc()
   552  	defer func() {
   553  		if err != nil {
   554  			c.metrics.checkpointCreationFail.Inc()
   555  		}
   556  	}()
   557  	// signal whether checkpoint writes should be amortized or burst
   558  	var immediate bool
   559  	n := c.iter.Count()
   560  	if n < 1 {
   561  		return c.writer.Close(false)
   562  	}
   563  
   564  	// Give a 10% buffer to the checkpoint duration in order to account for
   565  	// new series, slow writes, etc.
   566  	perSeriesDuration := (90 * c.dur) / (100 * time.Duration(n))
   567  
   568  	ticker := time.NewTicker(perSeriesDuration)
   569  	defer ticker.Stop()
   570  	start := time.Now()
   571  	defer func() {
   572  		elapsed := time.Since(start)
   573  		level.Info(util_log.Logger).Log("msg", "checkpoint done", "time", elapsed.String())
   574  		c.metrics.checkpointDuration.Observe(elapsed.Seconds())
   575  	}()
   576  
   577  	iter := c.iter.Iter()
   578  	for iter.Next() {
   579  		if err := c.writer.Write(iter.Stream()); err != nil {
   580  			return err
   581  		}
   582  
   583  		if !immediate {
   584  			if time.Since(start) > c.dur {
   585  				// This indicates the checkpoint is taking too long; stop waiting
   586  				// and flush the remaining series as fast as possible.
   587  				immediate = true
   588  				continue
   589  			}
   590  		}
   591  
   592  		select {
   593  		case <-c.quit:
   594  			return c.writer.Close(true)
   595  		case <-ticker.C:
   596  		}
   597  
   598  	}
   599  
   600  	if iter.Error() != nil {
   601  		return iter.Error()
   602  	}
   603  
   604  	return c.writer.Close(false)
   605  }
   606  
   607  func (c *Checkpointer) Run() {
   608  	ticker := time.NewTicker(c.dur)
   609  	defer ticker.Stop()
   610  	defer c.iter.Stop()
   611  
   612  	for {
   613  		select {
   614  		case <-ticker.C:
   615  			level.Info(util_log.Logger).Log("msg", "starting checkpoint")
   616  			if err := c.PerformCheckpoint(); err != nil {
   617  				level.Error(util_log.Logger).Log("msg", "error checkpointing series", "err", err)
   618  				continue
   619  			}
   620  		case <-c.quit:
   621  			return
   622  		}
   623  	}
   624  }
   625  
   626  func unflushedChunks(descs []chunkDesc) []chunkDesc {
   627  	filtered := make([]chunkDesc, 0, len(descs))
   628  
   629  	for _, d := range descs {
   630  		if d.flushed.IsZero() {
   631  			filtered = append(filtered, d)
   632  		}
   633  	}
   634  
   635  	return filtered
   636  }