github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/pkg/canary/comparator/comparator.go (about)

     1  package comparator
     2  
     3  import (
     4  	"fmt"
     5  	"io"
     6  	"math/rand"
     7  	"sync"
     8  	"time"
     9  
    10  	"github.com/prometheus/client_golang/prometheus"
    11  	"github.com/prometheus/client_golang/prometheus/promauto"
    12  	"github.com/weaveworks/common/instrument"
    13  
    14  	"github.com/grafana/loki/pkg/canary/reader"
    15  )
    16  
    17  const (
    18  	ErrOutOfOrderEntry           = "out of order entry %s was received before entries: %v\n"
    19  	ErrEntryNotReceivedWs        = "websocket failed to receive entry %v within %f seconds\n"
    20  	ErrEntryNotReceived          = "failed to receive entry %v within %f seconds\n"
    21  	ErrSpotCheckEntryNotReceived = "failed to find entry %v in Loki when spot check querying %v after it was written\n"
    22  	ErrDuplicateEntry            = "received a duplicate entry for ts %v\n"
    23  	ErrUnexpectedEntry           = "received an unexpected entry with ts %v\n"
    24  	DebugWebsocketMissingEntry   = "websocket missing entry: %v\n"
    25  	DebugQueryResult             = "confirmation query result: %v\n"
    26  	DebugEntryFound              = "missing websocket entry %v was found %v seconds after it was originally sent\n"
    27  )
    28  
    29  var (
    30  	totalEntries = promauto.NewCounter(prometheus.CounterOpts{
    31  		Namespace: "loki_canary",
    32  		Name:      "entries_total",
    33  		Help:      "counts log entries written to the file",
    34  	})
    35  	outOfOrderEntries = promauto.NewCounter(prometheus.CounterOpts{
    36  		Namespace: "loki_canary",
    37  		Name:      "out_of_order_entries_total",
    38  		Help:      "counts log entries received with a timestamp more recent than the others in the queue",
    39  	})
    40  	wsMissingEntries = promauto.NewCounter(prometheus.CounterOpts{
    41  		Namespace: "loki_canary",
    42  		Name:      "websocket_missing_entries_total",
    43  		Help:      "counts log entries not received within the wait duration via the websocket connection",
    44  	})
    45  	missingEntries = promauto.NewCounter(prometheus.CounterOpts{
    46  		Namespace: "loki_canary",
    47  		Name:      "missing_entries_total",
    48  		Help:      "counts log entries not received within the maxWait duration via both websocket and direct query",
    49  	})
    50  	spotCheckMissing = promauto.NewCounter(prometheus.CounterOpts{
    51  		Namespace: "loki_canary",
    52  		Name:      "spot_check_missing_entries_total",
    53  		Help:      "counts log entries not received when directly queried as part of spot checking",
    54  	})
    55  	spotCheckEntries = promauto.NewCounter(prometheus.CounterOpts{
    56  		Namespace: "loki_canary",
    57  		Name:      "spot_check_entries_total",
    58  		Help:      "total count of entries pot checked",
    59  	})
    60  	unexpectedEntries = promauto.NewCounter(prometheus.CounterOpts{
    61  		Namespace: "loki_canary",
    62  		Name:      "unexpected_entries_total",
    63  		Help:      "counts a log entry received which was not expected (e.g. received after reported missing)",
    64  	})
    65  	duplicateEntries = promauto.NewCounter(prometheus.CounterOpts{
    66  		Namespace: "loki_canary",
    67  		Name:      "duplicate_entries_total",
    68  		Help:      "counts a log entry received more than one time",
    69  	})
    70  	metricTestExpected = promauto.NewGauge(prometheus.GaugeOpts{
    71  		Namespace: "loki_canary",
    72  		Name:      "metric_test_expected",
    73  		Help:      "How many counts were expected by the metric test query",
    74  	})
    75  	metricTestActual = promauto.NewGauge(prometheus.GaugeOpts{
    76  		Namespace: "loki_canary",
    77  		Name:      "metric_test_actual",
    78  		Help:      "How many counts were actually received by the metric test query",
    79  	})
    80  	responseLatency   prometheus.Histogram
    81  	metricTestLatency = promauto.NewHistogram(prometheus.HistogramOpts{
    82  		Namespace: "loki_canary",
    83  		Name:      "metric_test_request_duration_seconds",
    84  		Help:      "how long the metric test query execution took in seconds.",
    85  		Buckets:   instrument.DefBuckets,
    86  	})
    87  	spotTestLatency = promauto.NewHistogram(prometheus.HistogramOpts{
    88  		Namespace: "loki_canary",
    89  		Name:      "spot_check_request_duration_seconds",
    90  		Help:      "how long the spot check test query execution took in seconds.",
    91  		Buckets:   instrument.DefBuckets,
    92  	})
    93  )
    94  
    95  type Comparator struct {
    96  	entMtx              sync.Mutex // Locks access to []entries and []ackdEntries
    97  	missingMtx          sync.Mutex // Locks access to []missingEntries
    98  	spotEntMtx          sync.Mutex // Locks access to []spotCheck
    99  	spotMtx             sync.Mutex // Locks spotcheckRunning for single threaded but async spotCheck()
   100  	metTestMtx          sync.Mutex // Locks metricTestRunning for single threaded but async metricTest()
   101  	pruneMtx            sync.Mutex // Locks pruneEntriesRunning for single threaded but async pruneEntries()
   102  	w                   io.Writer
   103  	entries             []*time.Time
   104  	missingEntries      []*time.Time
   105  	spotCheck           []*time.Time
   106  	ackdEntries         []*time.Time
   107  	wait                time.Duration
   108  	maxWait             time.Duration
   109  	pruneInterval       time.Duration
   110  	pruneEntriesRunning bool
   111  	spotCheckInterval   time.Duration
   112  	spotCheckMax        time.Duration
   113  	spotCheckQueryRate  time.Duration
   114  	spotCheckWait       time.Duration
   115  	spotCheckRunning    bool
   116  	metricTestInterval  time.Duration
   117  	metricTestRange     time.Duration
   118  	metricTestRunning   bool
   119  	writeInterval       time.Duration
   120  	confirmAsync        bool
   121  	startTime           time.Time
   122  	sent                chan time.Time
   123  	recv                chan time.Time
   124  	rdr                 reader.LokiReader
   125  	quit                chan struct{}
   126  	done                chan struct{}
   127  }
   128  
   129  func NewComparator(writer io.Writer,
   130  	wait time.Duration,
   131  	maxWait time.Duration,
   132  	pruneInterval time.Duration,
   133  	spotCheckInterval, spotCheckMax, spotCheckQueryRate, spotCheckWait time.Duration,
   134  	metricTestInterval time.Duration,
   135  	metricTestRange time.Duration,
   136  	writeInterval time.Duration,
   137  	buckets int,
   138  	sentChan chan time.Time,
   139  	receivedChan chan time.Time,
   140  	reader reader.LokiReader,
   141  	confirmAsync bool) *Comparator {
   142  	c := &Comparator{
   143  		w:                   writer,
   144  		entries:             []*time.Time{},
   145  		spotCheck:           []*time.Time{},
   146  		wait:                wait,
   147  		maxWait:             maxWait,
   148  		pruneInterval:       pruneInterval,
   149  		pruneEntriesRunning: false,
   150  		spotCheckInterval:   spotCheckInterval,
   151  		spotCheckMax:        spotCheckMax,
   152  		spotCheckQueryRate:  spotCheckQueryRate,
   153  		spotCheckWait:       spotCheckWait,
   154  		spotCheckRunning:    false,
   155  		metricTestInterval:  metricTestInterval,
   156  		metricTestRange:     metricTestRange,
   157  		metricTestRunning:   false,
   158  		writeInterval:       writeInterval,
   159  		confirmAsync:        confirmAsync,
   160  		startTime:           time.Now(),
   161  		sent:                sentChan,
   162  		recv:                receivedChan,
   163  		rdr:                 reader,
   164  		quit:                make(chan struct{}),
   165  		done:                make(chan struct{}),
   166  	}
   167  
   168  	if responseLatency == nil {
   169  		responseLatency = promauto.NewHistogram(prometheus.HistogramOpts{
   170  			Namespace: "loki_canary",
   171  			Name:      "response_latency_seconds",
   172  			Help:      "is how long it takes for log lines to be returned from Loki in seconds.",
   173  			Buckets:   prometheus.ExponentialBuckets(0.5, 2, buckets),
   174  		})
   175  	}
   176  
   177  	go c.run()
   178  
   179  	return c
   180  }
   181  
   182  func (c *Comparator) Stop() {
   183  	if c.quit != nil {
   184  		close(c.quit)
   185  		<-c.done
   186  		c.quit = nil
   187  	}
   188  }
   189  
   190  func (c *Comparator) entrySent(ts time.Time) {
   191  	c.entMtx.Lock()
   192  	c.entries = append(c.entries, &ts)
   193  	totalEntries.Inc()
   194  	c.entMtx.Unlock()
   195  	//If this entry equals or exceeds the spot check interval from the last entry in the spot check array, add it.
   196  	c.spotEntMtx.Lock()
   197  	if len(c.spotCheck) == 0 || ts.Sub(*c.spotCheck[len(c.spotCheck)-1]) >= c.spotCheckInterval {
   198  		c.spotCheck = append(c.spotCheck, &ts)
   199  	}
   200  	c.spotEntMtx.Unlock()
   201  }
   202  
   203  // entryReceived removes the received entry from the buffer if it exists, reports on out of order entries received
   204  func (c *Comparator) entryReceived(ts time.Time) {
   205  	c.entMtx.Lock()
   206  	defer c.entMtx.Unlock()
   207  
   208  	matched := false
   209  	c.entries = pruneList(c.entries,
   210  		func(_ int, t *time.Time) bool {
   211  			return ts.Equal(*t)
   212  		},
   213  		func(i int, t *time.Time) {
   214  			matched = true
   215  			// If this isn't the first item in the list we received it out of order
   216  			if i != 0 {
   217  				outOfOrderEntries.Inc()
   218  				fmt.Fprintf(c.w, ErrOutOfOrderEntry, t, c.entries[:i])
   219  			}
   220  			responseLatency.Observe(time.Since(ts).Seconds())
   221  			// Put this element in the acknowledged entries list so we can use it to check for duplicates
   222  			c.ackdEntries = append(c.ackdEntries, c.entries[i])
   223  		})
   224  
   225  	if !matched {
   226  		duplicate := false
   227  		for _, e := range c.ackdEntries {
   228  			if ts.Equal(*e) {
   229  				duplicate = true
   230  				duplicateEntries.Inc()
   231  				fmt.Fprintf(c.w, ErrDuplicateEntry, ts.UnixNano())
   232  				break
   233  			}
   234  		}
   235  		if !duplicate {
   236  			fmt.Fprintf(c.w, ErrUnexpectedEntry, ts.UnixNano())
   237  			unexpectedEntries.Inc()
   238  		}
   239  	}
   240  }
   241  
   242  func (c *Comparator) Size() int {
   243  	c.entMtx.Lock()
   244  	defer c.entMtx.Unlock()
   245  	return len(c.entries)
   246  }
   247  
   248  func (c *Comparator) run() {
   249  	t := time.NewTicker(c.pruneInterval)
   250  	// Use a random tick up to the interval for the first tick
   251  	firstMt := true
   252  	rand.Seed(time.Now().UnixNano())
   253  	mt := time.NewTicker(time.Duration(rand.Int63n(c.metricTestInterval.Nanoseconds())))
   254  	sc := time.NewTicker(c.spotCheckQueryRate)
   255  	defer func() {
   256  		t.Stop()
   257  		mt.Stop()
   258  		sc.Stop()
   259  		close(c.done)
   260  	}()
   261  
   262  	for {
   263  		select {
   264  		case e := <-c.recv:
   265  			c.entryReceived(e)
   266  		case e := <-c.sent:
   267  			c.entrySent(e)
   268  		case <-t.C:
   269  			// Only run one instance of prune entries at a time.
   270  			c.pruneMtx.Lock()
   271  			if !c.pruneEntriesRunning {
   272  				c.pruneEntriesRunning = true
   273  				go c.pruneEntries(time.Now())
   274  			}
   275  			c.pruneMtx.Unlock()
   276  		case <-sc.C:
   277  			// Only run one instance of spot check at a time.
   278  			c.spotMtx.Lock()
   279  			if !c.spotCheckRunning {
   280  				c.spotCheckRunning = true
   281  				go c.spotCheckEntries(time.Now())
   282  			}
   283  			c.spotMtx.Unlock()
   284  		case <-mt.C:
   285  			// Only run one intstance of metric tests at a time.
   286  			c.metTestMtx.Lock()
   287  			if !c.metricTestRunning {
   288  				c.metricTestRunning = true
   289  				go c.metricTest(time.Now())
   290  			}
   291  			c.metTestMtx.Unlock()
   292  			if firstMt {
   293  				// After the first tick which is at a random time, resume at the normal periodic interval
   294  				firstMt = false
   295  				mt.Reset(c.metricTestInterval)
   296  			}
   297  		case <-c.quit:
   298  			return
   299  		}
   300  	}
   301  }
   302  
   303  // check that the expected # of log lines have been written to Loki
   304  func (c *Comparator) metricTest(currTime time.Time) {
   305  	// Always make sure to set the running state back to false
   306  	defer func() {
   307  		c.metTestMtx.Lock()
   308  		c.metricTestRunning = false
   309  		c.metTestMtx.Unlock()
   310  	}()
   311  	adjustedRange := c.metricTestRange
   312  
   313  	// Adjust the query range to not be longer than the canary has been running.
   314  	// We can't query for 24 hours of counts if it's only been running for 10m,
   315  	// so we adjusted the range to the run time until we reach the desired lookback time.
   316  	if currTime.Add(-c.metricTestRange).Before(c.startTime) {
   317  		adjustedRange = currTime.Sub(c.startTime)
   318  	}
   319  	begin := time.Now()
   320  	actualCount, err := c.rdr.QueryCountOverTime(fmt.Sprintf("%.0fs", adjustedRange.Seconds()))
   321  	metricTestLatency.Observe(time.Since(begin).Seconds())
   322  	if err != nil {
   323  		fmt.Fprintf(c.w, "error running metric query test: %s\n", err.Error())
   324  		return
   325  	}
   326  	expectedCount := float64(adjustedRange.Milliseconds()) / float64(c.writeInterval.Milliseconds())
   327  	metricTestExpected.Set(expectedCount)
   328  	metricTestActual.Set(actualCount)
   329  }
   330  
   331  // spotCheck is used to ensure that log data is actually available after being flushed from the
   332  // ingesters in memory storage and  persisted to disk/blob storage, which the tail check cannot confirm.
   333  // It keeps a sampled slice of log lines written by the canary and checks for them periodically until
   334  // it decides they are too old to continue checking for.
   335  func (c *Comparator) spotCheckEntries(currTime time.Time) {
   336  	// Always make sure to set the running state back to false
   337  	defer func() {
   338  		c.spotMtx.Lock()
   339  		c.spotCheckRunning = false
   340  		c.spotMtx.Unlock()
   341  	}()
   342  	c.spotEntMtx.Lock()
   343  
   344  	// Remove any entries from the spotcheck list which are too old
   345  	c.spotCheck = pruneList(c.spotCheck,
   346  		func(_ int, t *time.Time) bool {
   347  			return t.Before(currTime.Add(-c.spotCheckMax))
   348  		},
   349  		func(_ int, t *time.Time) {
   350  
   351  		})
   352  
   353  	// Make a copy so we don't have to hold the lock to verify entries
   354  	cpy := make([]*time.Time, len(c.spotCheck))
   355  	copy(cpy, c.spotCheck)
   356  	c.spotEntMtx.Unlock()
   357  
   358  	for _, sce := range cpy {
   359  		// Make sure enough time has passed to start checking for this entry
   360  		if currTime.Sub(*sce) < c.spotCheckWait {
   361  			continue
   362  		}
   363  		spotCheckEntries.Inc()
   364  		// Because we are querying loki timestamps vs the timestamp in the log,
   365  		// make the range +/- 10 seconds to allow for clock inaccuracies
   366  		start := *sce
   367  		adjustedStart := start.Add(-10 * time.Second)
   368  		adjustedEnd := start.Add(10 * time.Second)
   369  		begin := time.Now()
   370  		recvd, err := c.rdr.Query(adjustedStart, adjustedEnd)
   371  		spotTestLatency.Observe(time.Since(begin).Seconds())
   372  		if err != nil {
   373  			fmt.Fprintf(c.w, "error querying loki: %s\n", err)
   374  			return
   375  		}
   376  
   377  		found := false
   378  		for _, r := range recvd {
   379  			if (*sce).Equal(r) {
   380  				found = true
   381  				break
   382  			}
   383  		}
   384  		if !found {
   385  			fmt.Fprintf(c.w, ErrSpotCheckEntryNotReceived, sce.UnixNano(), currTime.Sub(*sce))
   386  			for _, r := range recvd {
   387  				fmt.Fprintf(c.w, DebugQueryResult, r.UnixNano())
   388  			}
   389  			spotCheckMissing.Inc()
   390  		}
   391  	}
   392  
   393  }
   394  
   395  func (c *Comparator) pruneEntries(currentTime time.Time) {
   396  	// Always make sure to set the running state back to false
   397  	defer func() {
   398  		c.pruneMtx.Lock()
   399  		c.pruneEntriesRunning = false
   400  		c.pruneMtx.Unlock()
   401  	}()
   402  	c.entMtx.Lock()
   403  	defer c.entMtx.Unlock()
   404  
   405  	missing := []*time.Time{}
   406  	// Prune entry list of anything older than c.wait and add it to missing list
   407  	c.entries = pruneList(c.entries,
   408  		func(_ int, t *time.Time) bool {
   409  			return t.Before(currentTime.Add(-c.wait)) || t.Equal(currentTime.Add(-c.wait))
   410  		},
   411  		func(_ int, t *time.Time) {
   412  			missing = append(missing, t)
   413  			wsMissingEntries.Inc()
   414  			fmt.Fprintf(c.w, ErrEntryNotReceivedWs, t.UnixNano(), c.wait.Seconds())
   415  		})
   416  
   417  	// Add the list of missing entries to the list for which we will attempt to query Loki for
   418  	c.missingMtx.Lock()
   419  	c.missingEntries = append(c.missingEntries, missing...)
   420  	c.missingMtx.Unlock()
   421  	if len(c.missingEntries) > 0 {
   422  		if c.confirmAsync {
   423  			go c.confirmMissing(currentTime)
   424  		} else {
   425  			c.confirmMissing(currentTime)
   426  		}
   427  
   428  	}
   429  
   430  	// Prune c.ackdEntries list of old acknowledged entries which we were using to find duplicates
   431  	c.ackdEntries = pruneList(c.ackdEntries,
   432  		func(_ int, t *time.Time) bool {
   433  			return t.Before(currentTime.Add(-c.wait))
   434  		},
   435  		func(_ int, t *time.Time) {
   436  
   437  		})
   438  }
   439  
   440  func (c *Comparator) confirmMissing(currentTime time.Time) {
   441  	// Because we are querying loki timestamps vs the timestamp in the log,
   442  	// make the range +/- 10 seconds to allow for clock inaccuracies
   443  	c.missingMtx.Lock()
   444  	if len(c.missingEntries) == 0 {
   445  		c.missingMtx.Unlock()
   446  		return
   447  	}
   448  	start := *c.missingEntries[0]
   449  	start = start.Add(-10 * time.Second)
   450  	end := *c.missingEntries[len(c.missingEntries)-1]
   451  	end = end.Add(10 * time.Second)
   452  	c.missingMtx.Unlock()
   453  
   454  	recvd, err := c.rdr.Query(start, end)
   455  	if err != nil {
   456  		fmt.Fprintf(c.w, "error querying loki: %s\n", err)
   457  		return
   458  	}
   459  	// Now that query has returned, take out the lock on the missingEntries list so we can modify it
   460  	// It's possible more entries were added to this list but that's ok, if they match something in the
   461  	// query result we will remove them, if they don't they won't be old enough yet to remove.
   462  	c.missingMtx.Lock()
   463  	defer c.missingMtx.Unlock()
   464  
   465  	// This is to help debug some missing log entries when queried,
   466  	// let's print exactly what we are missing and what Loki sent back
   467  	for _, r := range c.missingEntries {
   468  		fmt.Fprintf(c.w, DebugWebsocketMissingEntry, r.UnixNano())
   469  	}
   470  	for _, r := range recvd {
   471  		fmt.Fprintf(c.w, DebugQueryResult, r.UnixNano())
   472  	}
   473  
   474  	k := 0
   475  	for i, m := range c.missingEntries {
   476  		found := false
   477  		for _, r := range recvd {
   478  			if (*m).Equal(r) {
   479  				// Entry was found in loki, this can be dropped from the list of missing
   480  				// which is done by NOT incrementing the output index k
   481  				fmt.Fprintf(c.w, DebugEntryFound, (*m).UnixNano(), currentTime.Sub(*m).Seconds())
   482  				found = true
   483  			}
   484  		}
   485  		if !found {
   486  			// Item is still missing
   487  			if i != k {
   488  				c.missingEntries[k] = c.missingEntries[i]
   489  			}
   490  			k++
   491  		}
   492  	}
   493  	// Nil out the pointers to any trailing elements which were removed from the slice
   494  	for i := k; i < len(c.missingEntries); i++ {
   495  		c.missingEntries[i] = nil // or the zero value of T
   496  	}
   497  	c.missingEntries = c.missingEntries[:k]
   498  
   499  	// Remove entries from missing list which are older than maxWait
   500  	removed := []*time.Time{}
   501  	c.missingEntries = pruneList(c.missingEntries,
   502  		func(_ int, t *time.Time) bool {
   503  			return t.Before(currentTime.Add(-c.maxWait))
   504  		},
   505  		func(_ int, t *time.Time) {
   506  			removed = append(removed, t)
   507  		})
   508  
   509  	// Record the entries which were removed and never received
   510  	for _, e := range removed {
   511  		missingEntries.Inc()
   512  		fmt.Fprintf(c.w, ErrEntryNotReceived, e.UnixNano(), c.maxWait.Seconds())
   513  	}
   514  }
   515  
   516  func pruneList(list []*time.Time, shouldRemove func(int, *time.Time) bool, handleRemoved func(int, *time.Time)) []*time.Time {
   517  	// Prune the acknowledged list, remove anything older than our maxwait
   518  	k := 0
   519  	for i, e := range list {
   520  		if shouldRemove(i, e) {
   521  			handleRemoved(i, e)
   522  			// Do not increment output index k, causing this entry to be dropped
   523  		} else {
   524  			// If items were skipped, assign the kth element to the current item which is not skipped
   525  			if i != k {
   526  				list[k] = list[i]
   527  			}
   528  			// Increment k for the next output item
   529  			k++
   530  		}
   531  	}
   532  	// Nil out the pointers to any trailing elements which were removed from the slice
   533  	for i := k; i < len(list); i++ {
   534  		list[i] = nil // or the zero value of T
   535  	}
   536  	// Reslice the list to the new size k
   537  	return list[:k]
   538  
   539  }