github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/pkg/ingester/checkpoint.go

package ingester

import (
	"bytes"
	"context"
	fmt "fmt"
	"io/ioutil"
	"os"
	"path/filepath"
	"regexp"
	"strconv"
	"time"

	"github.com/dustin/go-humanize"
	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/gogo/protobuf/proto"
	"github.com/pkg/errors"
	tsdb_errors "github.com/prometheus/prometheus/tsdb/errors"
	"github.com/prometheus/prometheus/tsdb/fileutil"
	"github.com/prometheus/prometheus/tsdb/wal"
	prompool "github.com/prometheus/prometheus/util/pool"

	"github.com/grafana/loki/pkg/chunkenc"
	"github.com/grafana/loki/pkg/logproto"
	util_log "github.com/grafana/loki/pkg/util/log"
	"github.com/grafana/loki/pkg/util/pool"
)

var (
	// todo(ctovena): these pool sizes should be derived from the actual
	// configuration (blocksize, targetsize). Start with something sane first,
	// then refine with more experience.

	// Buckets [1KB 2KB 4KB ... 4MB], doubling.
	chunksBufferPool = pool.NewBuffer(1024, 4*1024*1024, 2)
	// Buckets [64B 128B 256B ... 2MB], doubling.
	headBufferPool = pool.NewBuffer(64, 2*1024*1024, 2)
)

type chunkWithBuffer struct {
	blocks, head *bytes.Buffer
	Chunk
}

// The passed wireChunks slice is for re-use.
func toWireChunks(descs []chunkDesc, wireChunks []chunkWithBuffer) ([]chunkWithBuffer, error) {
	// Release memory from the previous list of chunks.
	for _, wc := range wireChunks {
		chunksBufferPool.Put(wc.blocks)
		headBufferPool.Put(wc.head)
		wc.Data = nil
		wc.Head = nil
	}

	if cap(wireChunks) < len(descs) {
		wireChunks = make([]chunkWithBuffer, len(descs))
	} else {
		wireChunks = wireChunks[:len(descs)]
	}

	for i, d := range descs {
		from, to := d.chunk.Bounds()
		chunkSize, headSize := d.chunk.CheckpointSize()

		wireChunk := chunkWithBuffer{
			Chunk: Chunk{
				From:        from,
				To:          to,
				Closed:      d.closed,
				FlushedAt:   d.flushed,
				LastUpdated: d.lastUpdated,
				Synced:      d.synced,
			},
			blocks: chunksBufferPool.Get(chunkSize),
			head:   headBufferPool.Get(headSize),
		}

		err := d.chunk.SerializeForCheckpointTo(
			wireChunk.blocks,
			wireChunk.head,
		)
		if err != nil {
			return nil, err
		}

		wireChunk.Data = wireChunk.blocks.Bytes()
		wireChunk.Head = wireChunk.head.Bytes()
		wireChunks[i] = wireChunk
	}
	return wireChunks, nil
}

func fromWireChunks(conf *Config, wireChunks []Chunk) ([]chunkDesc, error) {
	descs := make([]chunkDesc, 0, len(wireChunks))
	for _, c := range wireChunks {
		desc := chunkDesc{
			closed:      c.Closed,
			synced:      c.Synced,
			flushed:     c.FlushedAt,
			lastUpdated: c.LastUpdated,
		}

		// Always use unordered head blocks during replay to ensure Loki can
		// effectively replay an unordered-friendly WAL into a new
		// configuration that disables unordered writes.
		hbType := chunkenc.UnorderedHeadBlockFmt
		mc, err := chunkenc.MemchunkFromCheckpoint(c.Data, c.Head, hbType, conf.BlockSize, conf.TargetChunkSize)
		if err != nil {
			return nil, err
		}
		desc.chunk = mc

		descs = append(descs, desc)
	}
	return descs, nil
}
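
// Illustrative sketch, not part of the upstream file: the round trip the two
// helpers above are designed for. At checkpoint time toWireChunks serializes
// the in-memory chunk descriptors, the embedded Chunk values are persisted as
// part of a Series record, and at replay time fromWireChunks rebuilds the
// descriptors. The conf and descs arguments are assumed to come from the caller.
func roundTripChunksSketch(conf *Config, descs []chunkDesc) ([]chunkDesc, error) {
	wire, err := toWireChunks(descs, nil)
	if err != nil {
		return nil, err
	}
	// Only the embedded Chunk values (not the pooled buffers) are persisted.
	chunks := make([]Chunk, 0, len(wire))
	for _, wc := range wire {
		chunks = append(chunks, wc.Chunk)
	}
	return fromWireChunks(conf, chunks)
}
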
// nolint:interfacer
func decodeCheckpointRecord(rec []byte, s *Series) error {
	// TODO(owen-d): reduce allocs
	// The proto unmarshaling code will retain references to the underlying
	// []byte it's passed in order to reduce allocs. This is harmful to us
	// because, when reading from a WAL, the []byte is only guaranteed to be
	// valid between calls to Next(). Therefore, we copy it to avoid this
	// problem.
	cpy := make([]byte, len(rec))
	copy(cpy, rec)

	switch RecordType(cpy[0]) {
	case CheckpointRecord:
		return proto.Unmarshal(cpy[1:], s)
	default:
		return errors.Errorf("unexpected record type: %d", rec[0])
	}
}

func encodeWithTypeHeader(m *Series, typ RecordType, buf []byte) ([]byte, error) {
	size := m.Size()
	if cap(buf) < size+1 {
		buf = make([]byte, size+1)
	}
	_, err := m.MarshalTo(buf[1 : size+1])
	if err != nil {
		return nil, err
	}
	buf[0] = byte(typ)
	return buf[:size+1], nil
}

type SeriesWithErr struct {
	Err    error
	Series *Series
}

type SeriesIter interface {
	Count() int
	Iter() *streamIterator
	Stop()
}

type ingesterSeriesIter struct {
	ing ingesterInstances

	done chan struct{}
}

type ingesterInstances interface {
	getInstances() []*instance
}

func newIngesterSeriesIter(ing ingesterInstances) *ingesterSeriesIter {
	return &ingesterSeriesIter{
		ing:  ing,
		done: make(chan struct{}),
	}
}

func (i *ingesterSeriesIter) Count() (ct int) {
	for _, inst := range i.ing.getInstances() {
		ct += inst.numStreams()
	}
	return ct
}

func (i *ingesterSeriesIter) Stop() {
	close(i.done)
}

func (i *ingesterSeriesIter) Iter() *streamIterator {
	return newStreamsIterator(i.ing)
}

type streamInstance struct {
	id      string
	streams []*stream
}

type streamIterator struct {
	instances []streamInstance

	current Series
	buffer  []chunkWithBuffer
	err     error
}

// newStreamsIterator returns a new stream iterator that iterates over one
// instance at a time, then over each stream within that instance.
func newStreamsIterator(ing ingesterInstances) *streamIterator {
	instances := ing.getInstances()
	streamInstances := make([]streamInstance, len(instances))
	for i, inst := range instances {
		streams := make([]*stream, 0, inst.streams.Len())
		_ = inst.forAllStreams(context.Background(), func(s *stream) error {
			streams = append(streams, s)
			return nil
		})
		streamInstances[i] = streamInstance{
			streams: streams,
			id:      inst.instanceID,
		}
	}
	return &streamIterator{
		instances: streamInstances,
	}
}
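
// Illustrative sketch, not part of the upstream file: the consumption pattern
// streamIterator is built for. The *Series returned by Stream() is re-used
// between successful Next() calls (see the NOTE on Stream below), so each
// record must be written or copied before advancing. The writer argument is
// assumed to be any CheckpointWriter implementation from this file.
func drainStreamsSketch(it *streamIterator, w CheckpointWriter) error {
	for it.Next() {
		// Use the current series immediately; the next call to Next()
		// overwrites it in place.
		if err := w.Write(it.Stream()); err != nil {
			return err
		}
	}
	// Next() returns false both on exhaustion and on error; check which.
	return it.Error()
}
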
// Next loads the next stream of the current instance.
// If the current instance has no more streams, it moves on to the next
// instance until none remain. It returns true if there is a next stream;
// each successful call replaces the current stream.
func (s *streamIterator) Next() bool {
	if len(s.instances) == 0 {
		s.instances = nil
		return false
	}
	currentInstance := s.instances[0]
	if len(currentInstance.streams) == 0 {
		s.instances = s.instances[1:]
		return s.Next()
	}

	// The current stream.
	stream := currentInstance.streams[0]

	// Remove the first stream.
	s.instances[0].streams = s.instances[0].streams[1:]

	stream.chunkMtx.RLock()
	defer stream.chunkMtx.RUnlock()

	if len(stream.chunks) < 1 {
		// It's possible the stream has been flushed to storage in between
		// starting the checkpointing process and checkpointing this stream.
		return s.Next()
	}
	chunks, err := toWireChunks(stream.chunks, s.buffer)
	if err != nil {
		s.err = err
		return false
	}
	s.buffer = chunks

	s.current.Chunks = s.current.Chunks[:0]
	if cap(s.current.Chunks) == 0 {
		s.current.Chunks = make([]Chunk, 0, len(chunks))
	}

	for _, c := range chunks {
		s.current.Chunks = append(s.current.Chunks, c.Chunk)
	}

	s.current.UserID = currentInstance.id
	s.current.Fingerprint = uint64(stream.fp)
	s.current.Labels = logproto.FromLabelsToLabelAdapters(stream.labels)

	s.current.To = stream.lastLine.ts
	s.current.LastLine = stream.lastLine.content
	s.current.EntryCt = stream.entryCt
	s.current.HighestTs = stream.highestTs

	return true
}

// Error returns any error thrown while iterating over the streams.
func (s *streamIterator) Error() error {
	return s.err
}

// Stream returns the current series, a serializable (for checkpointing)
// stream of chunks.
// NOTE: the series is re-used between successful Next calls.
// This means you should make a copy or use the data before calling Next.
func (s *streamIterator) Stream() *Series {
	return &s.current
}

type CheckpointWriter interface {
	// Advance starts a new checkpoint; it can also signal a no-op.
	Advance() (noop bool, err error)
	Write(*Series) error
	// Close closes the current checkpoint.
	Close(abort bool) error
}

type walLogger interface {
	Log(recs ...[]byte) error
	Close() error
	Dir() string
}

type WALCheckpointWriter struct {
	metrics    *ingesterMetrics
	segmentWAL *wal.WAL

	checkpointWAL walLogger
	lastSegment   int    // index of the last segment guaranteed to be covered by the checkpoint
	final         string // filename to atomically rotate upon completion
	bufSize       int
	recs          [][]byte
}
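
// Illustrative sketch, not part of the upstream file: the lifecycle contract
// implemented by WALCheckpointWriter below. Advance opens a fresh ".tmp"
// checkpoint directory (or reports a no-op when there are no WAL segments),
// Write buffers and logs series records, and Close either publishes the
// checkpoint atomically (abort=false) or removes the partial one (abort=true).
// The series argument is a hypothetical, pre-built slice.
func checkpointLifecycleSketch(w CheckpointWriter, series []*Series) error {
	noop, err := w.Advance()
	if err != nil {
		return err
	}
	if noop {
		// Nothing to checkpoint yet.
		return nil
	}
	for _, s := range series {
		if err := w.Write(s); err != nil {
			// Abort so the partial .tmp checkpoint directory is removed.
			_ = w.Close(true)
			return err
		}
	}
	// Atomically rotate the .tmp directory into place and clean up old state.
	return w.Close(false)
}
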
func (w *WALCheckpointWriter) Advance() (bool, error) {
	_, lastSegment, err := wal.Segments(w.segmentWAL.Dir())
	if err != nil {
		return false, err
	}

	if lastSegment < 0 {
		// There are no WAL segments. No need for a checkpoint yet.
		return true, nil
	}

	// First we advance the WAL segment internally to ensure we don't overlap
	// a previous checkpoint in low-throughput scenarios and to minimize
	// segment replays on top of checkpoints.
	if err := w.segmentWAL.NextSegment(); err != nil {
		return false, err
	}

	// The checkpoint is named after the last WAL segment present so that when
	// replaying the WAL we can start from that particular WAL segment.
	checkpointDir := filepath.Join(w.segmentWAL.Dir(), fmt.Sprintf(checkpointPrefix+"%06d", lastSegment))
	level.Info(util_log.Logger).Log("msg", "attempting checkpoint for", "dir", checkpointDir)
	checkpointDirTemp := checkpointDir + ".tmp"

	// Clean up any old partial checkpoints.
	if _, err := os.Stat(checkpointDirTemp); err == nil {
		if err := os.RemoveAll(checkpointDirTemp); err != nil {
			level.Error(util_log.Logger).Log("msg", "unable to cleanup old tmp checkpoint", "dir", checkpointDirTemp)
			return false, err
		}
	}

	if err := os.MkdirAll(checkpointDirTemp, 0777); err != nil {
		return false, errors.Wrap(err, "create checkpoint dir")
	}

	checkpoint, err := wal.NewSize(log.With(util_log.Logger, "component", "checkpoint_wal"), nil, checkpointDirTemp, walSegmentSize, false)
	if err != nil {
		return false, errors.Wrap(err, "open checkpoint")
	}

	w.checkpointWAL = checkpoint
	w.lastSegment = lastSegment
	w.final = checkpointDir

	return false, nil
}

// Buckets [64KB to 256MB] by 2.
var recordBufferPool = prompool.New(1<<16, 1<<28, 2, func(size int) interface{} { return make([]byte, 0, size) })

func (w *WALCheckpointWriter) Write(s *Series) error {
	size := s.Size() + 1 // +1 for the record type header
	buf := recordBufferPool.Get(size).([]byte)[:size]

	b, err := encodeWithTypeHeader(s, CheckpointRecord, buf)
	if err != nil {
		return err
	}

	w.recs = append(w.recs, b)
	w.bufSize += len(b)
	level.Debug(util_log.Logger).Log("msg", "writing series", "size", humanize.Bytes(uint64(len(b))))

	// Flush once the buffered records exceed 1MB.
	if w.bufSize > 1<<20 {
		if err := w.flush(); err != nil {
			return err
		}
	}
	return nil
}

func (w *WALCheckpointWriter) flush() error {
	level.Debug(util_log.Logger).Log("msg", "flushing series", "totalSize", humanize.Bytes(uint64(w.bufSize)), "series", len(w.recs))
	if err := w.checkpointWAL.Log(w.recs...); err != nil {
		return err
	}
	w.metrics.checkpointLoggedBytesTotal.Add(float64(w.bufSize))
	for _, b := range w.recs {
		recordBufferPool.Put(b)
	}
	w.recs = w.recs[:0]
	w.bufSize = 0
	return nil
}

const checkpointPrefix = "checkpoint."

var checkpointRe = regexp.MustCompile("^" + regexp.QuoteMeta(checkpointPrefix) + "(\\d+)(\\.tmp)?$")

// checkpointIndex returns the index of a given checkpoint file. It handles
// both regular and temporary checkpoints according to the includeTmp flag. If
// the file is not a checkpoint it returns an error.
func checkpointIndex(filename string, includeTmp bool) (int, error) {
	result := checkpointRe.FindStringSubmatch(filename)
	if len(result) < 2 {
		return 0, errors.New("file is not a checkpoint")
	}
	// Filter out temporary checkpoints if desired.
	if !includeTmp && len(result) == 3 && result[2] != "" {
		return 0, errors.New("temporary checkpoint")
	}
	return strconv.Atoi(result[1])
}
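
// Illustrative sketch, not part of the upstream file: how the naming scheme
// above plays out. A finished checkpoint for WAL segment 42 is named
// "checkpoint.000042"; while it is being built it carries a ".tmp" suffix,
// which checkpointIndex only accepts when includeTmp is true.
func checkpointNamesSketch() {
	name := fmt.Sprintf(checkpointPrefix+"%06d", 42) // "checkpoint.000042"

	idx, err := checkpointIndex(name, false) // idx == 42, err == nil
	_, _ = idx, err

	_, err = checkpointIndex(name+".tmp", false) // err: temporary checkpoint
	_ = err

	idx, err = checkpointIndex(name+".tmp", true) // idx == 42, err == nil
	_, _ = idx, err
}
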
// lastCheckpoint returns the directory name and index of the most recent checkpoint.
// If dir does not contain any checkpoints, -1 is returned as the index.
func lastCheckpoint(dir string) (string, int, error) {
	dirs, err := ioutil.ReadDir(dir)
	if err != nil {
		return "", -1, err
	}
	var (
		maxIdx        = -1
		checkpointDir string
	)
	// There may be multiple checkpoints left, so select the one with the max index.
	for i := 0; i < len(dirs); i++ {
		di := dirs[i]

		idx, err := checkpointIndex(di.Name(), false)
		if err != nil {
			continue
		}
		if !di.IsDir() {
			return "", -1, fmt.Errorf("checkpoint %s is not a directory", di.Name())
		}
		if idx > maxIdx {
			checkpointDir = di.Name()
			maxIdx = idx
		}
	}
	if maxIdx >= 0 {
		return filepath.Join(dir, checkpointDir), maxIdx, nil
	}
	return "", -1, nil
}

// deleteCheckpoints deletes all checkpoints in the directory whose index is < maxIndex.
func (w *WALCheckpointWriter) deleteCheckpoints(maxIndex int) (err error) {
	w.metrics.checkpointDeleteTotal.Inc()
	defer func() {
		if err != nil {
			w.metrics.checkpointDeleteFail.Inc()
		}
	}()

	errs := tsdb_errors.NewMulti()

	files, err := ioutil.ReadDir(w.segmentWAL.Dir())
	if err != nil {
		return err
	}
	for _, fi := range files {
		index, err := checkpointIndex(fi.Name(), true)
		if err != nil || index >= maxIndex {
			continue
		}
		if err := os.RemoveAll(filepath.Join(w.segmentWAL.Dir(), fi.Name())); err != nil {
			errs.Add(err)
		}
	}
	return errs.Err()
}
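
// Illustrative sketch, not part of the upstream file: how replay logic might
// combine lastCheckpoint with the segment-numbering convention. The walDir
// argument and the recovery steps are assumptions for illustration only.
func resumeFromCheckpointSketch(walDir string) error {
	checkpointDir, idx, err := lastCheckpoint(walDir)
	if err != nil {
		return err
	}
	if idx < 0 {
		// No checkpoint exists: a replayer would read every WAL segment.
		return nil
	}
	// Otherwise: replay the checkpoint in checkpointDir first, then the WAL
	// segments numbered > idx, since the checkpoint is named after the last
	// segment it covers.
	_ = checkpointDir
	return nil
}
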
func (w *WALCheckpointWriter) Close(abort bool) error {
	if len(w.recs) > 0 {
		if err := w.flush(); err != nil {
			return err
		}
	}
	if err := w.checkpointWAL.Close(); err != nil {
		return err
	}

	if abort {
		return os.RemoveAll(w.checkpointWAL.Dir())
	}

	if err := fileutil.Replace(w.checkpointWAL.Dir(), w.final); err != nil {
		return errors.Wrap(err, "rename checkpoint directory")
	}
	level.Info(util_log.Logger).Log("msg", "atomic checkpoint finished", "old", w.checkpointWAL.Dir(), "new", w.final)
	// We delete the WAL segments which are before the previous checkpoint,
	// not all those before the current checkpoint we just created. This is
	// because, if the latest checkpoint is corrupted for any reason, we
	// should be able to recover from the older checkpoint, which would need
	// the older WAL segments.
	if err := w.segmentWAL.Truncate(w.lastSegment + 1); err != nil {
		// It is fine to have old WAL segments hanging around if deletion failed.
		// We can try again next time.
		level.Error(util_log.Logger).Log("msg", "error deleting old WAL segments", "err", err, "lastSegment", w.lastSegment)
	}

	if w.lastSegment >= 0 {
		if err := w.deleteCheckpoints(w.lastSegment); err != nil {
			// It is fine to have old checkpoints hanging around if deletion failed.
			// We can try again next time.
			level.Error(util_log.Logger).Log("msg", "error deleting old checkpoint", "err", err)
		}
	}

	return nil
}

type Checkpointer struct {
	dur     time.Duration
	iter    SeriesIter
	writer  CheckpointWriter
	metrics *ingesterMetrics

	quit <-chan struct{}
}

func NewCheckpointer(dur time.Duration, iter SeriesIter, writer CheckpointWriter, metrics *ingesterMetrics, quit <-chan struct{}) *Checkpointer {
	return &Checkpointer{
		dur:     dur,
		iter:    iter,
		writer:  writer,
		metrics: metrics,
		quit:    quit,
	}
}

func (c *Checkpointer) PerformCheckpoint() (err error) {
	noop, err := c.writer.Advance()
	if err != nil {
		return err
	}
	if noop {
		return nil
	}

	c.metrics.checkpointCreationTotal.Inc()
	defer func() {
		if err != nil {
			c.metrics.checkpointCreationFail.Inc()
		}
	}()
	// Signals whether checkpoint writes should be amortized or burst.
	var immediate bool
	n := c.iter.Count()
	if n < 1 {
		return c.writer.Close(false)
	}

	// Give a 10% buffer to the checkpoint duration in order to account for
	// new series, slow writes, etc.
	perSeriesDuration := (90 * c.dur) / (100 * time.Duration(n))

	ticker := time.NewTicker(perSeriesDuration)
	defer ticker.Stop()
	start := time.Now()
	defer func() {
		elapsed := time.Since(start)
		level.Info(util_log.Logger).Log("msg", "checkpoint done", "time", elapsed.String())
		c.metrics.checkpointDuration.Observe(elapsed.Seconds())
	}()

	iter := c.iter.Iter()
	for iter.Next() {
		if err := c.writer.Write(iter.Stream()); err != nil {
			return err
		}

		if !immediate {
			if time.Since(start) > c.dur {
				// This indicates the checkpoint is taking too long; stop waiting
				// and flush the remaining series as fast as possible.
				immediate = true
				continue
			}
		}

		select {
		case <-c.quit:
			return c.writer.Close(true)
		case <-ticker.C:
		}

	}

	if iter.Error() != nil {
		return iter.Error()
	}

	return c.writer.Close(false)
}

func (c *Checkpointer) Run() {
	ticker := time.NewTicker(c.dur)
	defer ticker.Stop()
	defer c.iter.Stop()

	for {
		select {
		case <-ticker.C:
			level.Info(util_log.Logger).Log("msg", "starting checkpoint")
			if err := c.PerformCheckpoint(); err != nil {
				level.Error(util_log.Logger).Log("msg", "error checkpointing series", "err", err)
				continue
			}
		case <-c.quit:
			return
		}
	}
}

func unflushedChunks(descs []chunkDesc) []chunkDesc {
	filtered := make([]chunkDesc, 0, len(descs))

	for _, d := range descs {
		if d.flushed.IsZero() {
			filtered = append(filtered, d)
		}
	}

	return filtered
}
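
// Illustrative sketch, not part of the upstream file: the pacing math used by
// PerformCheckpoint above. Writes are spread over ~90% of the checkpoint
// interval, leaving a 10% buffer for new series and slow writes. For example,
// with dur = 5m and n = 10000 series, the per-series tick is
// (90 * 5m) / (100 * 10000) = 27ms.
func pacingSketch(dur time.Duration, n int) time.Duration {
	return (90 * dur) / (100 * time.Duration(n))
}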