github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/pkg/ingester/stream.go (about)

     1  package ingester
     2  
     3  import (
     4  	"bytes"
     5  	"context"
     6  	"fmt"
     7  	"net/http"
     8  	"sync"
     9  	"time"
    10  
    11  	"github.com/go-kit/log/level"
    12  	"github.com/pkg/errors"
    13  	"github.com/prometheus/common/model"
    14  	"github.com/prometheus/prometheus/model/labels"
    15  	"github.com/weaveworks/common/httpgrpc"
    16  
    17  	"github.com/grafana/loki/pkg/chunkenc"
    18  	"github.com/grafana/loki/pkg/iter"
    19  	"github.com/grafana/loki/pkg/logproto"
    20  	"github.com/grafana/loki/pkg/logql/log"
    21  	"github.com/grafana/loki/pkg/logqlmodel/stats"
    22  	"github.com/grafana/loki/pkg/util/flagext"
    23  	util_log "github.com/grafana/loki/pkg/util/log"
    24  	"github.com/grafana/loki/pkg/validation"
    25  )
    26  
// ErrEntriesExist is returned by stream.Push when a push made with a non-zero
// replay counter carries a counter that is not greater than the stream's own
// entry counter, meaning those entries were already applied.
var ErrEntriesExist = errors.New("duplicate push - entries already exist")
    28  
// line identifies a log line by its timestamp and content. The stream keeps
// the most recently stored one so an identical follow-up entry can be skipped.
type line struct {
	ts      time.Time
	content string
}
    33  
// stream is the in-memory state of a single log stream inside the ingester:
// its chunks, its labels, the bookkeeping needed to reject duplicate or
// too-old entries, and the tailers that want to see newly stored entries.
type stream struct {
	// limiter is consulted once per entry in Push with the length of the line.
	limiter *StreamRateLimiter
	cfg     *Config
	// tenant is used as a label value on the discarded samples/bytes metrics.
	tenant string
	// Newest chunk at chunks[n-1].
	// Not thread-safe; assume accesses to this are locked by caller.
	chunks []chunkDesc
	fp     model.Fingerprint // possibly remapped fingerprint, used in the streams map
	// chunkMtx guards chunks. Push takes it only when asked to via lockChunk.
	chunkMtx sync.RWMutex

	labels labels.Labels
	// labelsString caches labels.String(), computed once in newStream.
	labelsString string

	// most recently pushed line. This is used to prevent duplicate pushes.
	// It also determines chunk synchronization when unordered writes are disabled.
	lastLine line

	// keeps track of the highest timestamp accepted by the stream.
	// This is used when unordered writes are enabled to cap the validity window
	// of accepted writes and for chunk synchronization.
	highestTs time.Time

	metrics *ingesterMetrics

	// tailers is keyed by the tailer's ID and guarded by tailerMtx.
	tailers   map[uint32]*tailer
	tailerMtx sync.RWMutex

	// entryCt is a counter which is incremented on each accepted entry.
	// This allows us to discard WAL entries during replays which were
	// already recovered via checkpoints. Historically out of order
	// errors were used to detect this, but this counter has been
	// introduced to facilitate removing the ordering constraint.
	entryCt int64

	// unorderedWrites selects the head block format for new chunks and enables
	// the "too far behind" validity window in Push.
	unorderedWrites bool
}
    70  
// chunkDesc wraps an in-memory chunk together with the per-chunk state the
// stream tracks for it.
type chunkDesc struct {
	chunk *chunkenc.MemChunk
	// closed is set by cutChunk after Close has been called on the chunk;
	// Push never appends to a closed chunk.
	closed bool
	// synced is set when the chunk is cut for synchronization
	// (see cutChunkForSynchronization).
	synced bool
	// NOTE(review): flushed and reason are not written in this file's visible
	// code; presumably they record when and why the chunk was flushed - confirm.
	flushed time.Time
	reason  string

	// lastUpdated is refreshed by Push after every entry it attempts to
	// append to this chunk, whether or not the append succeeded.
	lastUpdated time.Time
}
    80  
// entryWithError pairs an entry that Push could not store with the error that
// caused it to be rejected.
type entryWithError struct {
	entry *logproto.Entry
	e     error
}
    85  
    86  func newStream(cfg *Config, limits RateLimiterStrategy, tenant string, fp model.Fingerprint, labels labels.Labels, unorderedWrites bool, metrics *ingesterMetrics) *stream {
    87  	return &stream{
    88  		limiter:         NewStreamRateLimiter(limits, tenant, 10*time.Second),
    89  		cfg:             cfg,
    90  		fp:              fp,
    91  		labels:          labels,
    92  		labelsString:    labels.String(),
    93  		tailers:         map[uint32]*tailer{},
    94  		metrics:         metrics,
    95  		tenant:          tenant,
    96  		unorderedWrites: unorderedWrites,
    97  	}
    98  }
    99  
   100  // consumeChunk manually adds a chunk to the stream that was received during
   101  // ingester chunk transfer.
   102  // Must hold chunkMtx
   103  // DEPRECATED: chunk transfers are no longer suggested and remain for compatibility.
   104  func (s *stream) consumeChunk(_ context.Context, chunk *logproto.Chunk) error {
   105  	c, err := chunkenc.NewByteChunk(chunk.Data, s.cfg.BlockSize, s.cfg.TargetChunkSize)
   106  	if err != nil {
   107  		return err
   108  	}
   109  
   110  	s.chunks = append(s.chunks, chunkDesc{
   111  		chunk: c,
   112  	})
   113  	s.metrics.chunksCreatedTotal.Inc()
   114  	return nil
   115  }
   116  
   117  // setChunks is used during checkpoint recovery
   118  func (s *stream) setChunks(chunks []Chunk) (bytesAdded, entriesAdded int, err error) {
   119  	s.chunkMtx.Lock()
   120  	defer s.chunkMtx.Unlock()
   121  	chks, err := fromWireChunks(s.cfg, chunks)
   122  	if err != nil {
   123  		return 0, 0, err
   124  	}
   125  	s.chunks = chks
   126  	for _, c := range s.chunks {
   127  		entriesAdded += c.chunk.Size()
   128  		bytesAdded += c.chunk.UncompressedSize()
   129  	}
   130  	return bytesAdded, entriesAdded, nil
   131  }
   132  
   133  func (s *stream) NewChunk() *chunkenc.MemChunk {
   134  	return chunkenc.NewMemChunk(s.cfg.parsedEncoding, headBlockType(s.unorderedWrites), s.cfg.BlockSize, s.cfg.TargetChunkSize)
   135  }
   136  
   137  func (s *stream) Push(
   138  	ctx context.Context,
   139  	entries []logproto.Entry,
   140  	// WAL record to add push contents to.
   141  	// May be nil to disable this functionality.
   142  	record *WALRecord,
   143  	// Counter used in WAL replay to avoid duplicates.
   144  	// If this is non-zero, the stream will reject entries
   145  	// with a counter value less than or equal to it's own.
   146  	// It is set to zero and thus bypassed outside of WAL replays.
   147  	counter int64,
   148  	// Lock chunkMtx while pushing.
   149  	// If this is false, chunkMtx must be held outside Push.
   150  	lockChunk bool,
   151  ) (int, error) {
   152  	if lockChunk {
   153  		s.chunkMtx.Lock()
   154  		defer s.chunkMtx.Unlock()
   155  	}
   156  
   157  	isReplay := counter > 0
   158  	if isReplay && counter <= s.entryCt {
   159  		var byteCt int
   160  		for _, e := range entries {
   161  			byteCt += len(e.Line)
   162  		}
   163  
   164  		s.metrics.walReplaySamplesDropped.WithLabelValues(duplicateReason).Add(float64(len(entries)))
   165  		s.metrics.walReplayBytesDropped.WithLabelValues(duplicateReason).Add(float64(byteCt))
   166  		return 0, ErrEntriesExist
   167  	}
   168  
   169  	var bytesAdded int
   170  	prevNumChunks := len(s.chunks)
   171  	if prevNumChunks == 0 {
   172  		s.chunks = append(s.chunks, chunkDesc{
   173  			chunk: s.NewChunk(),
   174  		})
   175  		s.metrics.chunksCreatedTotal.Inc()
   176  		s.metrics.chunkCreatedStats.Inc(1)
   177  	}
   178  
   179  	var storedEntries []logproto.Entry
   180  	failedEntriesWithError := []entryWithError{}
   181  
   182  	var outOfOrderSamples, outOfOrderBytes int
   183  	var rateLimitedSamples, rateLimitedBytes int
   184  	defer func() {
   185  		if outOfOrderSamples > 0 {
   186  			name := validation.OutOfOrder
   187  			if s.unorderedWrites {
   188  				name = validation.TooFarBehind
   189  			}
   190  			validation.DiscardedSamples.WithLabelValues(name, s.tenant).Add(float64(outOfOrderSamples))
   191  			validation.DiscardedBytes.WithLabelValues(name, s.tenant).Add(float64(outOfOrderBytes))
   192  		}
   193  		if rateLimitedSamples > 0 {
   194  			validation.DiscardedSamples.WithLabelValues(validation.StreamRateLimit, s.tenant).Add(float64(rateLimitedSamples))
   195  			validation.DiscardedBytes.WithLabelValues(validation.StreamRateLimit, s.tenant).Add(float64(rateLimitedBytes))
   196  		}
   197  	}()
   198  
   199  	// This call uses a mutex under the hood, cache the result since we're checking the limit
   200  	// on each entry in the push (hot path) and we only use this value when logging entries
   201  	// over the rate limit.
   202  	limit := s.limiter.lim.Limit()
   203  
   204  	// Don't fail on the first append error - if samples are sent out of order,
   205  	// we still want to append the later ones.
   206  	for i := range entries {
   207  		// If this entry matches our last appended line's timestamp and contents,
   208  		// ignore it.
   209  		//
   210  		// This check is done at the stream level so it persists across cut and
   211  		// flushed chunks.
   212  		//
   213  		// NOTE: it's still possible for duplicates to be appended if a stream is
   214  		// deleted from inactivity.
   215  		if entries[i].Timestamp.Equal(s.lastLine.ts) && entries[i].Line == s.lastLine.content {
   216  			continue
   217  		}
   218  
   219  		chunk := &s.chunks[len(s.chunks)-1]
   220  		if chunk.closed || !chunk.chunk.SpaceFor(&entries[i]) || s.cutChunkForSynchronization(entries[i].Timestamp, s.highestTs, chunk, s.cfg.SyncPeriod, s.cfg.SyncMinUtilization) {
   221  			chunk = s.cutChunk(ctx)
   222  		}
   223  		// Check if this this should be rate limited.
   224  		now := time.Now()
   225  		if !s.limiter.AllowN(now, len(entries[i].Line)) {
   226  			failedEntriesWithError = append(failedEntriesWithError, entryWithError{&entries[i], &validation.ErrStreamRateLimit{RateLimit: flagext.ByteSize(limit), Labels: s.labelsString, Bytes: flagext.ByteSize(len(entries[i].Line))}})
   227  			rateLimitedSamples++
   228  			rateLimitedBytes += len(entries[i].Line)
   229  			continue
   230  		}
   231  
   232  		// The validity window for unordered writes is the highest timestamp present minus 1/2 * max-chunk-age.
   233  		cutoff := s.highestTs.Add(-s.cfg.MaxChunkAge / 2)
   234  		if !isReplay && s.unorderedWrites && !s.highestTs.IsZero() && cutoff.After(entries[i].Timestamp) {
   235  			failedEntriesWithError = append(failedEntriesWithError, entryWithError{&entries[i], chunkenc.ErrTooFarBehind(cutoff)})
   236  			outOfOrderSamples++
   237  			outOfOrderBytes += len(entries[i].Line)
   238  		} else if err := chunk.chunk.Append(&entries[i]); err != nil {
   239  			failedEntriesWithError = append(failedEntriesWithError, entryWithError{&entries[i], err})
   240  			if chunkenc.IsOutOfOrderErr(err) {
   241  				outOfOrderSamples++
   242  				outOfOrderBytes += len(entries[i].Line)
   243  			}
   244  		} else {
   245  			storedEntries = append(storedEntries, entries[i])
   246  			s.lastLine.ts = entries[i].Timestamp
   247  			s.lastLine.content = entries[i].Line
   248  			if s.highestTs.Before(entries[i].Timestamp) {
   249  				s.highestTs = entries[i].Timestamp
   250  			}
   251  			s.entryCt++
   252  
   253  			// length of string plus
   254  			bytesAdded += len(entries[i].Line)
   255  		}
   256  		chunk.lastUpdated = time.Now()
   257  	}
   258  
   259  	if len(storedEntries) != 0 {
   260  		// record will be nil when replaying the wal (we don't want to rewrite wal entries as we replay them).
   261  		if record != nil {
   262  			record.AddEntries(uint64(s.fp), s.entryCt, storedEntries...)
   263  		} else {
   264  			// If record is nil, this is a WAL recovery.
   265  			s.metrics.recoveredEntriesTotal.Add(float64(len(storedEntries)))
   266  		}
   267  
   268  		s.tailerMtx.RLock()
   269  		hasTailers := len(s.tailers) != 0
   270  		s.tailerMtx.RUnlock()
   271  		if hasTailers {
   272  			go func() {
   273  				stream := logproto.Stream{Labels: s.labelsString, Entries: storedEntries}
   274  
   275  				closedTailers := []uint32{}
   276  
   277  				s.tailerMtx.RLock()
   278  				for _, tailer := range s.tailers {
   279  					if tailer.isClosed() {
   280  						closedTailers = append(closedTailers, tailer.getID())
   281  						continue
   282  					}
   283  					tailer.send(stream, s.labels)
   284  				}
   285  				s.tailerMtx.RUnlock()
   286  
   287  				if len(closedTailers) != 0 {
   288  					s.tailerMtx.Lock()
   289  					defer s.tailerMtx.Unlock()
   290  
   291  					for _, closedTailerID := range closedTailers {
   292  						delete(s.tailers, closedTailerID)
   293  					}
   294  				}
   295  			}()
   296  		}
   297  	}
   298  
   299  	if len(s.chunks) != prevNumChunks {
   300  		s.metrics.memoryChunks.Add(float64(len(s.chunks) - prevNumChunks))
   301  	}
   302  
   303  	if len(failedEntriesWithError) > 0 {
   304  		lastEntryWithErr := failedEntriesWithError[len(failedEntriesWithError)-1]
   305  		_, ok := lastEntryWithErr.e.(*validation.ErrStreamRateLimit)
   306  		outOfOrder := chunkenc.IsOutOfOrderErr(lastEntryWithErr.e)
   307  		if !outOfOrder && !ok {
   308  			return bytesAdded, lastEntryWithErr.e
   309  		}
   310  		var statusCode int
   311  		if outOfOrder {
   312  			statusCode = http.StatusBadRequest
   313  		}
   314  		if ok {
   315  			statusCode = http.StatusTooManyRequests
   316  		}
   317  		// Return a http status 4xx request response with all failed entries.
   318  		buf := bytes.Buffer{}
   319  		streamName := s.labelsString
   320  
   321  		limitedFailedEntries := failedEntriesWithError
   322  		if maxIgnore := s.cfg.MaxReturnedErrors; maxIgnore > 0 && len(limitedFailedEntries) > maxIgnore {
   323  			limitedFailedEntries = limitedFailedEntries[:maxIgnore]
   324  		}
   325  
   326  		for _, entryWithError := range limitedFailedEntries {
   327  			fmt.Fprintf(&buf,
   328  				"entry with timestamp %s ignored, reason: '%s' for stream: %s,\n",
   329  				entryWithError.entry.Timestamp.String(), entryWithError.e.Error(), streamName)
   330  		}
   331  
   332  		fmt.Fprintf(&buf, "total ignored: %d out of %d", len(failedEntriesWithError), len(entries))
   333  
   334  		return bytesAdded, httpgrpc.Errorf(statusCode, buf.String())
   335  	}
   336  
   337  	return bytesAdded, nil
   338  }
   339  
   340  func (s *stream) cutChunk(ctx context.Context) *chunkDesc {
   341  	// If the chunk has no more space call Close to make sure anything in the head block is cut and compressed
   342  	chunk := &s.chunks[len(s.chunks)-1]
   343  	err := chunk.chunk.Close()
   344  	if err != nil {
   345  		// This should be an unlikely situation, returning an error up the stack doesn't help much here
   346  		// so instead log this to help debug the issue if it ever arises.
   347  		level.Error(util_log.WithContext(ctx, util_log.Logger)).Log("msg", "failed to Close chunk", "err", err)
   348  	}
   349  	chunk.closed = true
   350  
   351  	s.metrics.samplesPerChunk.Observe(float64(chunk.chunk.Size()))
   352  	s.metrics.blocksPerChunk.Observe(float64(chunk.chunk.BlockCount()))
   353  	s.metrics.chunksCreatedTotal.Inc()
   354  	s.metrics.chunkCreatedStats.Inc(1)
   355  
   356  	s.chunks = append(s.chunks, chunkDesc{
   357  		chunk: s.NewChunk(),
   358  	})
   359  	return &s.chunks[len(s.chunks)-1]
   360  }
   361  
   362  // Returns true, if chunk should be cut before adding new entry. This is done to make ingesters
   363  // cut the chunk for this stream at the same moment, so that new chunk will contain exactly the same entries.
   364  func (s *stream) cutChunkForSynchronization(entryTimestamp, latestTs time.Time, c *chunkDesc, synchronizePeriod time.Duration, minUtilization float64) bool {
   365  	// Never sync when it's not enabled, it's the first push, or if a write isn't the latest ts
   366  	// to prevent syncing many unordered writes.
   367  	if synchronizePeriod <= 0 || latestTs.IsZero() || latestTs.After(entryTimestamp) {
   368  		return false
   369  	}
   370  
   371  	// we use fingerprint as a jitter here, basically offsetting stream synchronization points to different
   372  	// this breaks if streams are mapped to different fingerprints on different ingesters, which is too bad.
   373  	cts := (uint64(entryTimestamp.UnixNano()) + uint64(s.fp)) % uint64(synchronizePeriod.Nanoseconds())
   374  	pts := (uint64(latestTs.UnixNano()) + uint64(s.fp)) % uint64(synchronizePeriod.Nanoseconds())
   375  
   376  	// if current entry timestamp has rolled over synchronization period
   377  	if cts < pts {
   378  		if minUtilization <= 0 {
   379  			c.synced = true
   380  			return true
   381  		}
   382  
   383  		if c.chunk.Utilization() > minUtilization {
   384  			c.synced = true
   385  			return true
   386  		}
   387  	}
   388  
   389  	return false
   390  }
   391  
   392  func (s *stream) Bounds() (from, to time.Time) {
   393  	s.chunkMtx.RLock()
   394  	defer s.chunkMtx.RUnlock()
   395  	if len(s.chunks) > 0 {
   396  		from, _ = s.chunks[0].chunk.Bounds()
   397  		_, to = s.chunks[len(s.chunks)-1].chunk.Bounds()
   398  	}
   399  	return from, to
   400  }
   401  
   402  // Returns an iterator.
   403  func (s *stream) Iterator(ctx context.Context, statsCtx *stats.Context, from, through time.Time, direction logproto.Direction, pipeline log.StreamPipeline) (iter.EntryIterator, error) {
   404  	s.chunkMtx.RLock()
   405  	defer s.chunkMtx.RUnlock()
   406  	iterators := make([]iter.EntryIterator, 0, len(s.chunks))
   407  
   408  	var lastMax time.Time
   409  	ordered := true
   410  
   411  	for _, c := range s.chunks {
   412  		mint, maxt := c.chunk.Bounds()
   413  
   414  		// skip this chunk
   415  		if through.Before(mint) || maxt.Before(from) {
   416  			continue
   417  		}
   418  
   419  		if mint.Before(lastMax) {
   420  			ordered = false
   421  		}
   422  		lastMax = maxt
   423  
   424  		itr, err := c.chunk.Iterator(ctx, from, through, direction, pipeline)
   425  		if err != nil {
   426  			return nil, err
   427  		}
   428  		if itr != nil {
   429  			iterators = append(iterators, itr)
   430  		}
   431  	}
   432  
   433  	if direction != logproto.FORWARD {
   434  		for left, right := 0, len(iterators)-1; left < right; left, right = left+1, right-1 {
   435  			iterators[left], iterators[right] = iterators[right], iterators[left]
   436  		}
   437  	}
   438  
   439  	if statsCtx != nil {
   440  		statsCtx.AddIngesterTotalChunkMatched(int64(len(iterators)))
   441  	}
   442  
   443  	if ordered {
   444  		return iter.NewNonOverlappingIterator(iterators), nil
   445  	}
   446  	return iter.NewSortEntryIterator(iterators, direction), nil
   447  }
   448  
   449  // Returns an SampleIterator.
   450  func (s *stream) SampleIterator(ctx context.Context, statsCtx *stats.Context, from, through time.Time, extractor log.StreamSampleExtractor) (iter.SampleIterator, error) {
   451  	s.chunkMtx.RLock()
   452  	defer s.chunkMtx.RUnlock()
   453  	iterators := make([]iter.SampleIterator, 0, len(s.chunks))
   454  
   455  	var lastMax time.Time
   456  	ordered := true
   457  
   458  	for _, c := range s.chunks {
   459  		mint, maxt := c.chunk.Bounds()
   460  
   461  		// skip this chunk
   462  		if through.Before(mint) || maxt.Before(from) {
   463  			continue
   464  		}
   465  
   466  		if mint.Before(lastMax) {
   467  			ordered = false
   468  		}
   469  		lastMax = maxt
   470  
   471  		if itr := c.chunk.SampleIterator(ctx, from, through, extractor); itr != nil {
   472  			iterators = append(iterators, itr)
   473  		}
   474  	}
   475  
   476  	if statsCtx != nil {
   477  		statsCtx.AddIngesterTotalChunkMatched(int64(len(iterators)))
   478  	}
   479  
   480  	if ordered {
   481  		return iter.NewNonOverlappingSampleIterator(iterators), nil
   482  	}
   483  	return iter.NewSortSampleIterator(iterators), nil
   484  }
   485  
   486  func (s *stream) addTailer(t *tailer) {
   487  	s.tailerMtx.Lock()
   488  	defer s.tailerMtx.Unlock()
   489  
   490  	s.tailers[t.getID()] = t
   491  }
   492  
// resetCounter sets the stream's accepted-entry counter back to zero.
// It takes no lock itself.
func (s *stream) resetCounter() {
	s.entryCt = 0
}
   496  
   497  func headBlockType(unorderedWrites bool) chunkenc.HeadBlockFmt {
   498  	if unorderedWrites {
   499  		return chunkenc.UnorderedHeadBlockFmt
   500  	}
   501  	return chunkenc.OrderedHeadBlockFmt
   502  }