github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/pkg/ingester/recovery.go

package ingester

import (
	"io"
	"runtime"
	"sync"

	"github.com/go-kit/log/level"
	"github.com/pkg/errors"
	"github.com/prometheus/prometheus/tsdb/chunks"
	"github.com/prometheus/prometheus/tsdb/record"
	"github.com/prometheus/prometheus/tsdb/wal"
	"golang.org/x/net/context"

	"github.com/grafana/loki/pkg/logproto"
	util_log "github.com/grafana/loki/pkg/util/log"
)

type WALReader interface {
	Next() bool
	Err() error
	// Record returns the current record. The returned slice is only valid
	// until the next call to Next().
	Record() []byte
}

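// drainWAL is a hypothetical helper (not part of the original file) sketching
// the WALReader contract: a record must be copied if it is retained, because
// the slice returned by Record() is reused on the next call to Next().
func drainWAL(r WALReader) ([][]byte, error) {
	var records [][]byte
	for r.Next() {
		rec := r.Record()
		// Copy before the next iteration invalidates the backing slice.
		buf := make([]byte, len(rec))
		copy(buf, rec)
		records = append(records, buf)
	}
	return records, r.Err()
}
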
type NoopWALReader struct{}

func (NoopWALReader) Next() bool     { return false }
func (NoopWALReader) Err() error     { return nil }
func (NoopWALReader) Record() []byte { return nil }
func (NoopWALReader) Close() error   { return nil }

func newCheckpointReader(dir string) (WALReader, io.Closer, error) {
	lastCheckpointDir, idx, err := lastCheckpoint(dir)
	if err != nil {
		return nil, nil, err
	}
	if idx < 0 {
		level.Info(util_log.Logger).Log("msg", "no checkpoint found, treating as no-op")
		var reader NoopWALReader
		return reader, reader, nil
	}

	r, err := wal.NewSegmentsReader(lastCheckpointDir)
	if err != nil {
		return nil, nil, err
	}
	return wal.NewReader(r), r, nil
}

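// replayCheckpoint is a minimal sketch (not part of the original file) of how
// newCheckpointReader is meant to be used: open the most recent checkpoint,
// drain it, and always close the underlying segment reader. The decoding and
// application of each record is elided.
func replayCheckpoint(dir string) error {
	reader, closer, err := newCheckpointReader(dir)
	if err != nil {
		return err
	}
	defer closer.Close()

	for reader.Next() {
		_ = reader.Record() // decode and apply the checkpoint record here
	}
	return reader.Err()
}
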
type Recoverer interface {
	NumWorkers() int
	Series(series *Series) error
	SetStream(userID string, series record.RefSeries) error
	Push(userID string, entries RefEntries) error
	Done() <-chan struct{}
}

type ingesterRecoverer struct {
	// basically map[userID]map[fingerprint]*stream
	users sync.Map
	ing   *Ingester

	done chan struct{}
}

func newIngesterRecoverer(i *Ingester) *ingesterRecoverer {
	return &ingesterRecoverer{
		ing:  i,
		done: make(chan struct{}),
	}
}

// NumWorkers uses all available cores.
func (r *ingesterRecoverer) NumWorkers() int { return runtime.GOMAXPROCS(0) }

func (r *ingesterRecoverer) Series(series *Series) error {
	return r.ing.replayController.WithBackPressure(func() error {
		inst, err := r.ing.GetOrCreateInstance(series.UserID)
		if err != nil {
			return err
		}

		// TODO(owen-d): create another fn to avoid unnecessary label type conversions.
		stream, err := inst.getOrCreateStream(logproto.Stream{
			Labels: logproto.FromLabelAdaptersToLabels(series.Labels).String(),
		}, nil)
		if err != nil {
			return err
		}

		bytesAdded, entriesAdded, err := stream.setChunks(series.Chunks)
		stream.lastLine.ts = series.To
		stream.lastLine.content = series.LastLine
		stream.entryCt = series.EntryCt
		stream.highestTs = series.HighestTs

		if err != nil {
			return err
		}
		r.ing.metrics.memoryChunks.Add(float64(len(series.Chunks)))
		r.ing.metrics.recoveredChunksTotal.Add(float64(len(series.Chunks)))
		r.ing.metrics.recoveredEntriesTotal.Add(float64(entriesAdded))
		r.ing.replayController.Add(int64(bytesAdded))

		// Store the stream in the recovery map under the fingerprint originally
		// recorded in the WAL: the fingerprint assigned to the newly created
		// stream may differ, and subsequent WAL records reference the original.
		got, _ := r.users.LoadOrStore(series.UserID, &sync.Map{})
		streamsMap := got.(*sync.Map)
		streamsMap.Store(chunks.HeadSeriesRef(series.Fingerprint), stream)

		return nil
	})
}

// SetStream is responsible for setting the key path for userIDs -> fingerprints -> streams.
// Internally, this uses nested sync.Maps due to their performance benefits for sets that only grow.
// Using these also allows us to bypass the ingester -> instance -> stream hierarchy internally,
// which may yield some performance gains, but is essential for the following reason:
// because of the instance's fingerprint mapper, stream fingerprints are NOT necessarily
// deterministic. The WAL records the post-mapped fingerprint from the ingester that originally
// created the stream, so we must ensure that fingerprint maps correctly to the newly created
// stream during WAL replay, even if the new in-memory stream was assigned a different
// fingerprint by the mapper. This is paramount because subsequent WAL records will use
// the fingerprint reported in the WAL record, not the potentially differing one assigned during
// stream creation.
func (r *ingesterRecoverer) SetStream(userID string, series record.RefSeries) error {
	inst, err := r.ing.GetOrCreateInstance(userID)
	if err != nil {
		return err
	}

	stream, err := inst.getOrCreateStream(
		logproto.Stream{
			Labels: series.Labels.String(),
		},
		nil,
	)
	if err != nil {
		return err
	}

	// Now that we have the stream, ensure that the userID -> fingerprint -> stream
	// path is set properly.
	got, _ := r.users.LoadOrStore(userID, &sync.Map{})
	streamsMap := got.(*sync.Map)
	streamsMap.Store(series.Ref, stream)
	return nil
}

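// lookupStream is a hypothetical helper (not part of the original file)
// sketching how the nested sync.Map path built by Series and SetStream is
// resolved later: userID -> WAL-recorded fingerprint -> *stream. Push performs
// the equivalent lookups inline.
func (r *ingesterRecoverer) lookupStream(userID string, ref chunks.HeadSeriesRef) (*stream, bool) {
	got, ok := r.users.Load(userID)
	if !ok {
		return nil, false
	}
	s, ok := got.(*sync.Map).Load(ref)
	if !ok {
		return nil, false
	}
	return s.(*stream), true
}
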
func (r *ingesterRecoverer) Push(userID string, entries RefEntries) error {
	return r.ing.replayController.WithBackPressure(func() error {
		out, ok := r.users.Load(userID)
		if !ok {
			return errors.Errorf("user (%s) not set during WAL replay", userID)
		}

		s, ok := out.(*sync.Map).Load(entries.Ref)
		if !ok {
			return errors.Errorf("stream (%d) not set during WAL replay for user (%s)", entries.Ref, userID)
		}

		// Push errors are deliberately swallowed here: a checkpoint may already
		// contain data from the WAL segments, so out-of-order and duplicate
		// entries are expected. Duplicates are only counted for metrics.
		bytesAdded, err := s.(*stream).Push(context.Background(), entries.Entries, nil, entries.Counter, true)
		r.ing.replayController.Add(int64(bytesAdded))
		if err == ErrEntriesExist {
			r.ing.metrics.duplicateEntriesTotal.Add(float64(len(entries.Entries)))
		}
		return nil
	})
}

func (r *ingesterRecoverer) Close() {
	// Ensure this is only run once.
	select {
	case <-r.done:
		return
	default:
	}

	close(r.done)

	// Enable the limiter here to accurately reflect tenant limits after recovery.
	r.ing.limiter.Enable()

	for _, inst := range r.ing.getInstances() {
		inst.forAllStreams(context.Background(), func(s *stream) error {
			s.chunkMtx.Lock()
			defer s.chunkMtx.Unlock()

			// Reset all the incrementing stream counters after a successful WAL replay.
			s.resetCounter()

			if len(s.chunks) == 0 {
				inst.removeStream(s)
				return nil
			}

			// If we've replayed a WAL with unordered writes, but the new
			// configuration disables them, convert all streams/head blocks
			// to ensure unordered writes are disabled after the replay,
			// but without dropping any previously accepted data.
			isAllowed := r.ing.limiter.UnorderedWrites(s.tenant)
			old := s.unorderedWrites
			s.unorderedWrites = isAllowed

			if !isAllowed && old {
				err := s.chunks[len(s.chunks)-1].chunk.ConvertHead(headBlockType(isAllowed))
				if err != nil {
					level.Warn(util_log.Logger).Log(
						"msg", "error converting headblock",
						"err", err.Error(),
						"stream", s.labels.String(),
						"component", "ingesterRecoverer",
					)
				}
			}

			return nil
		})
	}
}

func (r *ingesterRecoverer) Done() <-chan struct{} {
	return r.done
}

func RecoverWAL(reader WALReader, recoverer Recoverer) error {
	dispatch := func(recoverer Recoverer, b []byte, inputs []chan recoveryInput) error {
		rec := recordPool.GetRecord()
		if err := decodeWALRecord(b, rec); err != nil {
			return err
		}

		// First process all series to ensure we don't write entries to nonexistent series.
		var firstErr error
		for _, s := range rec.Series {
			if err := recoverer.SetStream(rec.UserID, s); err != nil {
				if firstErr == nil {
					firstErr = err
				}
			}
		}

		// Shard entries across workers by series ref so that all entries for a
		// given series are handled by the same worker.
		for _, entries := range rec.RefEntries {
			worker := int(uint64(entries.Ref) % uint64(len(inputs)))
			inputs[worker] <- recoveryInput{
				userID: rec.UserID,
				data:   entries,
			}
		}

		return firstErr
	}

	process := func(recoverer Recoverer, input <-chan recoveryInput, errCh chan<- error) {
		for {
			select {
			case <-recoverer.Done():
				// Nothing to do; the worker exits when its input channel is closed.

			case next, ok := <-input:
				if !ok {
					return
				}
				entries, ok := next.data.(RefEntries)
				var err error
				if !ok {
					err = errors.Errorf("unexpected type (%T) when recovering WAL, expecting (%T)", next.data, entries)
				}
				if err == nil {
					err = recoverer.Push(next.userID, entries)
				}

				// Pass the error back, but respect the quit signal.
				if err != nil {
					errCh <- err
				}
			}
		}
	}

	return recoverGeneric(
		reader,
		recoverer,
		dispatch,
		process,
	)
}

func RecoverCheckpoint(reader WALReader, recoverer Recoverer) error {
	dispatch := func(recoverer Recoverer, b []byte, inputs []chan recoveryInput) error {
		s := &Series{}
		if err := decodeCheckpointRecord(b, s); err != nil {
			return err
		}

		// Shard series across workers by fingerprint.
		worker := int(s.Fingerprint % uint64(len(inputs)))
		inputs[worker] <- recoveryInput{
			userID: s.UserID,
			data:   s,
		}
		return nil
	}

	process := func(recoverer Recoverer, input <-chan recoveryInput, errCh chan<- error) {
		for {
			select {
			case <-recoverer.Done():
				// Nothing to do; the worker exits when its input channel is closed.

			case next, ok := <-input:
				if !ok {
					return
				}
				series, ok := next.data.(*Series)
				var err error
				if !ok {
					err = errors.Errorf("unexpected type (%T) when recovering WAL, expecting (%T)", next.data, series)
				}
				if err == nil {
					err = recoverer.Series(series)
				}

				// Pass the error back, but respect the quit signal.
				if err != nil {
					errCh <- err
				}
			}
		}
	}

	return recoverGeneric(
		reader,
		recoverer,
		dispatch,
		process,
	)
}

type recoveryInput struct {
	userID string
	data   interface{}
}

// recoverGeneric lets recovery from different WAL record types share one
// implementation by parameterizing the dispatch and process functions.
// Note: it explicitly does not call the Recoverer.Close function, as it's possible to layer
// multiple recoveries on top of each other, as in the case of recovering from checkpoints
// and then the WAL.
func recoverGeneric(
	reader WALReader,
	recoverer Recoverer,
	dispatch func(Recoverer, []byte, []chan recoveryInput) error,
	process func(Recoverer, <-chan recoveryInput, chan<- error),
) error {
	var wg sync.WaitGroup
	var firstErr error
	nWorkers := recoverer.NumWorkers()

	if nWorkers < 1 {
		return errors.New("cannot recover with no workers")
	}

	errCh := make(chan error)
	inputs := make([]chan recoveryInput, 0, nWorkers)
	wg.Add(nWorkers)
	for i := 0; i < nWorkers; i++ {
		inputs = append(inputs, make(chan recoveryInput))

		go func(input <-chan recoveryInput) {
			defer wg.Done()
			process(recoverer, input, errCh)
		}(inputs[i])
	}

	// Read records and dispatch them to the workers; close the inputs when the
	// reader is exhausted so the workers can exit.
	go func() {
		for reader.Next() {
			b := reader.Record()
			if err := reader.Err(); err != nil {
				errCh <- err
				continue
			}

			if err := dispatch(recoverer, b, inputs); err != nil {
				errCh <- err
				continue
			}
		}

		for _, w := range inputs {
			close(w)
		}
	}()

	finished := make(chan struct{})
	go func(finished chan<- struct{}) {
		wg.Wait()
		finished <- struct{}{}
	}(finished)

	// Collect errors until all workers finish, returning only the first.
	for {
		select {
		case <-finished:
			return firstErr
		case err := <-errCh:
			if firstErr == nil {
				firstErr = err
			}
		}
	}
}
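
// recoverIngester is a minimal sketch (not part of the original file) of how
// these pieces compose: replay the most recent checkpoint first, then the WAL
// segments on top of it, and only then Close the recoverer to reset counters
// and re-enable limits. The walDir parameter and the error handling are
// assumptions for illustration.
func recoverIngester(walDir string, ing *Ingester) error {
	recoverer := newIngesterRecoverer(ing)
	defer recoverer.Close()

	// 1. Checkpoint: the bulk of series state at the time of checkpointing.
	checkpointReader, closer, err := newCheckpointReader(walDir)
	if err != nil {
		return err
	}
	defer closer.Close()
	if err := RecoverCheckpoint(checkpointReader, recoverer); err != nil {
		return err
	}

	// 2. WAL segments: records written since the checkpoint. Overlap with the
	// checkpoint is tolerated because Push ignores already-existing entries.
	segmentReader, err := wal.NewSegmentsReader(walDir)
	if err != nil {
		return err
	}
	defer segmentReader.Close()
	return RecoverWAL(wal.NewReader(segmentReader), recoverer)
}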