github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/pkg/canary/comparator/comparator.go (about) 1 package comparator 2 3 import ( 4 "fmt" 5 "io" 6 "math/rand" 7 "sync" 8 "time" 9 10 "github.com/prometheus/client_golang/prometheus" 11 "github.com/prometheus/client_golang/prometheus/promauto" 12 "github.com/weaveworks/common/instrument" 13 14 "github.com/grafana/loki/pkg/canary/reader" 15 ) 16 17 const ( 18 ErrOutOfOrderEntry = "out of order entry %s was received before entries: %v\n" 19 ErrEntryNotReceivedWs = "websocket failed to receive entry %v within %f seconds\n" 20 ErrEntryNotReceived = "failed to receive entry %v within %f seconds\n" 21 ErrSpotCheckEntryNotReceived = "failed to find entry %v in Loki when spot check querying %v after it was written\n" 22 ErrDuplicateEntry = "received a duplicate entry for ts %v\n" 23 ErrUnexpectedEntry = "received an unexpected entry with ts %v\n" 24 DebugWebsocketMissingEntry = "websocket missing entry: %v\n" 25 DebugQueryResult = "confirmation query result: %v\n" 26 DebugEntryFound = "missing websocket entry %v was found %v seconds after it was originally sent\n" 27 ) 28 29 var ( 30 totalEntries = promauto.NewCounter(prometheus.CounterOpts{ 31 Namespace: "loki_canary", 32 Name: "entries_total", 33 Help: "counts log entries written to the file", 34 }) 35 outOfOrderEntries = promauto.NewCounter(prometheus.CounterOpts{ 36 Namespace: "loki_canary", 37 Name: "out_of_order_entries_total", 38 Help: "counts log entries received with a timestamp more recent than the others in the queue", 39 }) 40 wsMissingEntries = promauto.NewCounter(prometheus.CounterOpts{ 41 Namespace: "loki_canary", 42 Name: "websocket_missing_entries_total", 43 Help: "counts log entries not received within the wait duration via the websocket connection", 44 }) 45 missingEntries = promauto.NewCounter(prometheus.CounterOpts{ 46 Namespace: "loki_canary", 47 Name: "missing_entries_total", 48 Help: "counts log entries not received within the maxWait duration via both websocket and direct query", 49 }) 50 spotCheckMissing = promauto.NewCounter(prometheus.CounterOpts{ 51 Namespace: "loki_canary", 52 Name: "spot_check_missing_entries_total", 53 Help: "counts log entries not received when directly queried as part of spot checking", 54 }) 55 spotCheckEntries = promauto.NewCounter(prometheus.CounterOpts{ 56 Namespace: "loki_canary", 57 Name: "spot_check_entries_total", 58 Help: "total count of entries pot checked", 59 }) 60 unexpectedEntries = promauto.NewCounter(prometheus.CounterOpts{ 61 Namespace: "loki_canary", 62 Name: "unexpected_entries_total", 63 Help: "counts a log entry received which was not expected (e.g. received after reported missing)", 64 }) 65 duplicateEntries = promauto.NewCounter(prometheus.CounterOpts{ 66 Namespace: "loki_canary", 67 Name: "duplicate_entries_total", 68 Help: "counts a log entry received more than one time", 69 }) 70 metricTestExpected = promauto.NewGauge(prometheus.GaugeOpts{ 71 Namespace: "loki_canary", 72 Name: "metric_test_expected", 73 Help: "How many counts were expected by the metric test query", 74 }) 75 metricTestActual = promauto.NewGauge(prometheus.GaugeOpts{ 76 Namespace: "loki_canary", 77 Name: "metric_test_actual", 78 Help: "How many counts were actually received by the metric test query", 79 }) 80 responseLatency prometheus.Histogram 81 metricTestLatency = promauto.NewHistogram(prometheus.HistogramOpts{ 82 Namespace: "loki_canary", 83 Name: "metric_test_request_duration_seconds", 84 Help: "how long the metric test query execution took in seconds.", 85 Buckets: instrument.DefBuckets, 86 }) 87 spotTestLatency = promauto.NewHistogram(prometheus.HistogramOpts{ 88 Namespace: "loki_canary", 89 Name: "spot_check_request_duration_seconds", 90 Help: "how long the spot check test query execution took in seconds.", 91 Buckets: instrument.DefBuckets, 92 }) 93 ) 94 95 type Comparator struct { 96 entMtx sync.Mutex // Locks access to []entries and []ackdEntries 97 missingMtx sync.Mutex // Locks access to []missingEntries 98 spotEntMtx sync.Mutex // Locks access to []spotCheck 99 spotMtx sync.Mutex // Locks spotcheckRunning for single threaded but async spotCheck() 100 metTestMtx sync.Mutex // Locks metricTestRunning for single threaded but async metricTest() 101 pruneMtx sync.Mutex // Locks pruneEntriesRunning for single threaded but async pruneEntries() 102 w io.Writer 103 entries []*time.Time 104 missingEntries []*time.Time 105 spotCheck []*time.Time 106 ackdEntries []*time.Time 107 wait time.Duration 108 maxWait time.Duration 109 pruneInterval time.Duration 110 pruneEntriesRunning bool 111 spotCheckInterval time.Duration 112 spotCheckMax time.Duration 113 spotCheckQueryRate time.Duration 114 spotCheckWait time.Duration 115 spotCheckRunning bool 116 metricTestInterval time.Duration 117 metricTestRange time.Duration 118 metricTestRunning bool 119 writeInterval time.Duration 120 confirmAsync bool 121 startTime time.Time 122 sent chan time.Time 123 recv chan time.Time 124 rdr reader.LokiReader 125 quit chan struct{} 126 done chan struct{} 127 } 128 129 func NewComparator(writer io.Writer, 130 wait time.Duration, 131 maxWait time.Duration, 132 pruneInterval time.Duration, 133 spotCheckInterval, spotCheckMax, spotCheckQueryRate, spotCheckWait time.Duration, 134 metricTestInterval time.Duration, 135 metricTestRange time.Duration, 136 writeInterval time.Duration, 137 buckets int, 138 sentChan chan time.Time, 139 receivedChan chan time.Time, 140 reader reader.LokiReader, 141 confirmAsync bool) *Comparator { 142 c := &Comparator{ 143 w: writer, 144 entries: []*time.Time{}, 145 spotCheck: []*time.Time{}, 146 wait: wait, 147 maxWait: maxWait, 148 pruneInterval: pruneInterval, 149 pruneEntriesRunning: false, 150 spotCheckInterval: spotCheckInterval, 151 spotCheckMax: spotCheckMax, 152 spotCheckQueryRate: spotCheckQueryRate, 153 spotCheckWait: spotCheckWait, 154 spotCheckRunning: false, 155 metricTestInterval: metricTestInterval, 156 metricTestRange: metricTestRange, 157 metricTestRunning: false, 158 writeInterval: writeInterval, 159 confirmAsync: confirmAsync, 160 startTime: time.Now(), 161 sent: sentChan, 162 recv: receivedChan, 163 rdr: reader, 164 quit: make(chan struct{}), 165 done: make(chan struct{}), 166 } 167 168 if responseLatency == nil { 169 responseLatency = promauto.NewHistogram(prometheus.HistogramOpts{ 170 Namespace: "loki_canary", 171 Name: "response_latency_seconds", 172 Help: "is how long it takes for log lines to be returned from Loki in seconds.", 173 Buckets: prometheus.ExponentialBuckets(0.5, 2, buckets), 174 }) 175 } 176 177 go c.run() 178 179 return c 180 } 181 182 func (c *Comparator) Stop() { 183 if c.quit != nil { 184 close(c.quit) 185 <-c.done 186 c.quit = nil 187 } 188 } 189 190 func (c *Comparator) entrySent(ts time.Time) { 191 c.entMtx.Lock() 192 c.entries = append(c.entries, &ts) 193 totalEntries.Inc() 194 c.entMtx.Unlock() 195 //If this entry equals or exceeds the spot check interval from the last entry in the spot check array, add it. 196 c.spotEntMtx.Lock() 197 if len(c.spotCheck) == 0 || ts.Sub(*c.spotCheck[len(c.spotCheck)-1]) >= c.spotCheckInterval { 198 c.spotCheck = append(c.spotCheck, &ts) 199 } 200 c.spotEntMtx.Unlock() 201 } 202 203 // entryReceived removes the received entry from the buffer if it exists, reports on out of order entries received 204 func (c *Comparator) entryReceived(ts time.Time) { 205 c.entMtx.Lock() 206 defer c.entMtx.Unlock() 207 208 matched := false 209 c.entries = pruneList(c.entries, 210 func(_ int, t *time.Time) bool { 211 return ts.Equal(*t) 212 }, 213 func(i int, t *time.Time) { 214 matched = true 215 // If this isn't the first item in the list we received it out of order 216 if i != 0 { 217 outOfOrderEntries.Inc() 218 fmt.Fprintf(c.w, ErrOutOfOrderEntry, t, c.entries[:i]) 219 } 220 responseLatency.Observe(time.Since(ts).Seconds()) 221 // Put this element in the acknowledged entries list so we can use it to check for duplicates 222 c.ackdEntries = append(c.ackdEntries, c.entries[i]) 223 }) 224 225 if !matched { 226 duplicate := false 227 for _, e := range c.ackdEntries { 228 if ts.Equal(*e) { 229 duplicate = true 230 duplicateEntries.Inc() 231 fmt.Fprintf(c.w, ErrDuplicateEntry, ts.UnixNano()) 232 break 233 } 234 } 235 if !duplicate { 236 fmt.Fprintf(c.w, ErrUnexpectedEntry, ts.UnixNano()) 237 unexpectedEntries.Inc() 238 } 239 } 240 } 241 242 func (c *Comparator) Size() int { 243 c.entMtx.Lock() 244 defer c.entMtx.Unlock() 245 return len(c.entries) 246 } 247 248 func (c *Comparator) run() { 249 t := time.NewTicker(c.pruneInterval) 250 // Use a random tick up to the interval for the first tick 251 firstMt := true 252 rand.Seed(time.Now().UnixNano()) 253 mt := time.NewTicker(time.Duration(rand.Int63n(c.metricTestInterval.Nanoseconds()))) 254 sc := time.NewTicker(c.spotCheckQueryRate) 255 defer func() { 256 t.Stop() 257 mt.Stop() 258 sc.Stop() 259 close(c.done) 260 }() 261 262 for { 263 select { 264 case e := <-c.recv: 265 c.entryReceived(e) 266 case e := <-c.sent: 267 c.entrySent(e) 268 case <-t.C: 269 // Only run one instance of prune entries at a time. 270 c.pruneMtx.Lock() 271 if !c.pruneEntriesRunning { 272 c.pruneEntriesRunning = true 273 go c.pruneEntries(time.Now()) 274 } 275 c.pruneMtx.Unlock() 276 case <-sc.C: 277 // Only run one instance of spot check at a time. 278 c.spotMtx.Lock() 279 if !c.spotCheckRunning { 280 c.spotCheckRunning = true 281 go c.spotCheckEntries(time.Now()) 282 } 283 c.spotMtx.Unlock() 284 case <-mt.C: 285 // Only run one intstance of metric tests at a time. 286 c.metTestMtx.Lock() 287 if !c.metricTestRunning { 288 c.metricTestRunning = true 289 go c.metricTest(time.Now()) 290 } 291 c.metTestMtx.Unlock() 292 if firstMt { 293 // After the first tick which is at a random time, resume at the normal periodic interval 294 firstMt = false 295 mt.Reset(c.metricTestInterval) 296 } 297 case <-c.quit: 298 return 299 } 300 } 301 } 302 303 // check that the expected # of log lines have been written to Loki 304 func (c *Comparator) metricTest(currTime time.Time) { 305 // Always make sure to set the running state back to false 306 defer func() { 307 c.metTestMtx.Lock() 308 c.metricTestRunning = false 309 c.metTestMtx.Unlock() 310 }() 311 adjustedRange := c.metricTestRange 312 313 // Adjust the query range to not be longer than the canary has been running. 314 // We can't query for 24 hours of counts if it's only been running for 10m, 315 // so we adjusted the range to the run time until we reach the desired lookback time. 316 if currTime.Add(-c.metricTestRange).Before(c.startTime) { 317 adjustedRange = currTime.Sub(c.startTime) 318 } 319 begin := time.Now() 320 actualCount, err := c.rdr.QueryCountOverTime(fmt.Sprintf("%.0fs", adjustedRange.Seconds())) 321 metricTestLatency.Observe(time.Since(begin).Seconds()) 322 if err != nil { 323 fmt.Fprintf(c.w, "error running metric query test: %s\n", err.Error()) 324 return 325 } 326 expectedCount := float64(adjustedRange.Milliseconds()) / float64(c.writeInterval.Milliseconds()) 327 metricTestExpected.Set(expectedCount) 328 metricTestActual.Set(actualCount) 329 } 330 331 // spotCheck is used to ensure that log data is actually available after being flushed from the 332 // ingesters in memory storage and persisted to disk/blob storage, which the tail check cannot confirm. 333 // It keeps a sampled slice of log lines written by the canary and checks for them periodically until 334 // it decides they are too old to continue checking for. 335 func (c *Comparator) spotCheckEntries(currTime time.Time) { 336 // Always make sure to set the running state back to false 337 defer func() { 338 c.spotMtx.Lock() 339 c.spotCheckRunning = false 340 c.spotMtx.Unlock() 341 }() 342 c.spotEntMtx.Lock() 343 344 // Remove any entries from the spotcheck list which are too old 345 c.spotCheck = pruneList(c.spotCheck, 346 func(_ int, t *time.Time) bool { 347 return t.Before(currTime.Add(-c.spotCheckMax)) 348 }, 349 func(_ int, t *time.Time) { 350 351 }) 352 353 // Make a copy so we don't have to hold the lock to verify entries 354 cpy := make([]*time.Time, len(c.spotCheck)) 355 copy(cpy, c.spotCheck) 356 c.spotEntMtx.Unlock() 357 358 for _, sce := range cpy { 359 // Make sure enough time has passed to start checking for this entry 360 if currTime.Sub(*sce) < c.spotCheckWait { 361 continue 362 } 363 spotCheckEntries.Inc() 364 // Because we are querying loki timestamps vs the timestamp in the log, 365 // make the range +/- 10 seconds to allow for clock inaccuracies 366 start := *sce 367 adjustedStart := start.Add(-10 * time.Second) 368 adjustedEnd := start.Add(10 * time.Second) 369 begin := time.Now() 370 recvd, err := c.rdr.Query(adjustedStart, adjustedEnd) 371 spotTestLatency.Observe(time.Since(begin).Seconds()) 372 if err != nil { 373 fmt.Fprintf(c.w, "error querying loki: %s\n", err) 374 return 375 } 376 377 found := false 378 for _, r := range recvd { 379 if (*sce).Equal(r) { 380 found = true 381 break 382 } 383 } 384 if !found { 385 fmt.Fprintf(c.w, ErrSpotCheckEntryNotReceived, sce.UnixNano(), currTime.Sub(*sce)) 386 for _, r := range recvd { 387 fmt.Fprintf(c.w, DebugQueryResult, r.UnixNano()) 388 } 389 spotCheckMissing.Inc() 390 } 391 } 392 393 } 394 395 func (c *Comparator) pruneEntries(currentTime time.Time) { 396 // Always make sure to set the running state back to false 397 defer func() { 398 c.pruneMtx.Lock() 399 c.pruneEntriesRunning = false 400 c.pruneMtx.Unlock() 401 }() 402 c.entMtx.Lock() 403 defer c.entMtx.Unlock() 404 405 missing := []*time.Time{} 406 // Prune entry list of anything older than c.wait and add it to missing list 407 c.entries = pruneList(c.entries, 408 func(_ int, t *time.Time) bool { 409 return t.Before(currentTime.Add(-c.wait)) || t.Equal(currentTime.Add(-c.wait)) 410 }, 411 func(_ int, t *time.Time) { 412 missing = append(missing, t) 413 wsMissingEntries.Inc() 414 fmt.Fprintf(c.w, ErrEntryNotReceivedWs, t.UnixNano(), c.wait.Seconds()) 415 }) 416 417 // Add the list of missing entries to the list for which we will attempt to query Loki for 418 c.missingMtx.Lock() 419 c.missingEntries = append(c.missingEntries, missing...) 420 c.missingMtx.Unlock() 421 if len(c.missingEntries) > 0 { 422 if c.confirmAsync { 423 go c.confirmMissing(currentTime) 424 } else { 425 c.confirmMissing(currentTime) 426 } 427 428 } 429 430 // Prune c.ackdEntries list of old acknowledged entries which we were using to find duplicates 431 c.ackdEntries = pruneList(c.ackdEntries, 432 func(_ int, t *time.Time) bool { 433 return t.Before(currentTime.Add(-c.wait)) 434 }, 435 func(_ int, t *time.Time) { 436 437 }) 438 } 439 440 func (c *Comparator) confirmMissing(currentTime time.Time) { 441 // Because we are querying loki timestamps vs the timestamp in the log, 442 // make the range +/- 10 seconds to allow for clock inaccuracies 443 c.missingMtx.Lock() 444 if len(c.missingEntries) == 0 { 445 c.missingMtx.Unlock() 446 return 447 } 448 start := *c.missingEntries[0] 449 start = start.Add(-10 * time.Second) 450 end := *c.missingEntries[len(c.missingEntries)-1] 451 end = end.Add(10 * time.Second) 452 c.missingMtx.Unlock() 453 454 recvd, err := c.rdr.Query(start, end) 455 if err != nil { 456 fmt.Fprintf(c.w, "error querying loki: %s\n", err) 457 return 458 } 459 // Now that query has returned, take out the lock on the missingEntries list so we can modify it 460 // It's possible more entries were added to this list but that's ok, if they match something in the 461 // query result we will remove them, if they don't they won't be old enough yet to remove. 462 c.missingMtx.Lock() 463 defer c.missingMtx.Unlock() 464 465 // This is to help debug some missing log entries when queried, 466 // let's print exactly what we are missing and what Loki sent back 467 for _, r := range c.missingEntries { 468 fmt.Fprintf(c.w, DebugWebsocketMissingEntry, r.UnixNano()) 469 } 470 for _, r := range recvd { 471 fmt.Fprintf(c.w, DebugQueryResult, r.UnixNano()) 472 } 473 474 k := 0 475 for i, m := range c.missingEntries { 476 found := false 477 for _, r := range recvd { 478 if (*m).Equal(r) { 479 // Entry was found in loki, this can be dropped from the list of missing 480 // which is done by NOT incrementing the output index k 481 fmt.Fprintf(c.w, DebugEntryFound, (*m).UnixNano(), currentTime.Sub(*m).Seconds()) 482 found = true 483 } 484 } 485 if !found { 486 // Item is still missing 487 if i != k { 488 c.missingEntries[k] = c.missingEntries[i] 489 } 490 k++ 491 } 492 } 493 // Nil out the pointers to any trailing elements which were removed from the slice 494 for i := k; i < len(c.missingEntries); i++ { 495 c.missingEntries[i] = nil // or the zero value of T 496 } 497 c.missingEntries = c.missingEntries[:k] 498 499 // Remove entries from missing list which are older than maxWait 500 removed := []*time.Time{} 501 c.missingEntries = pruneList(c.missingEntries, 502 func(_ int, t *time.Time) bool { 503 return t.Before(currentTime.Add(-c.maxWait)) 504 }, 505 func(_ int, t *time.Time) { 506 removed = append(removed, t) 507 }) 508 509 // Record the entries which were removed and never received 510 for _, e := range removed { 511 missingEntries.Inc() 512 fmt.Fprintf(c.w, ErrEntryNotReceived, e.UnixNano(), c.maxWait.Seconds()) 513 } 514 } 515 516 func pruneList(list []*time.Time, shouldRemove func(int, *time.Time) bool, handleRemoved func(int, *time.Time)) []*time.Time { 517 // Prune the acknowledged list, remove anything older than our maxwait 518 k := 0 519 for i, e := range list { 520 if shouldRemove(i, e) { 521 handleRemoved(i, e) 522 // Do not increment output index k, causing this entry to be dropped 523 } else { 524 // If items were skipped, assign the kth element to the current item which is not skipped 525 if i != k { 526 list[k] = list[i] 527 } 528 // Increment k for the next output item 529 k++ 530 } 531 } 532 // Nil out the pointers to any trailing elements which were removed from the slice 533 for i := k; i < len(list); i++ { 534 list[i] = nil // or the zero value of T 535 } 536 // Reslice the list to the new size k 537 return list[:k] 538 539 }