github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/pkg/ingester/recovery.go

package ingester

import (
	io "io"
	"runtime"
	"sync"

	"github.com/go-kit/log/level"
	"github.com/pkg/errors"
	"github.com/prometheus/prometheus/tsdb/chunks"
	"github.com/prometheus/prometheus/tsdb/record"
	"github.com/prometheus/prometheus/tsdb/wal"
	"golang.org/x/net/context"

	"github.com/grafana/loki/pkg/logproto"
	util_log "github.com/grafana/loki/pkg/util/log"
)

type WALReader interface {
	Next() bool
	Err() error
	// Record should not be used across multiple calls to Next()
	Record() []byte
}

type NoopWALReader struct{}

func (NoopWALReader) Next() bool     { return false }
func (NoopWALReader) Err() error     { return nil }
func (NoopWALReader) Record() []byte { return nil }
func (NoopWALReader) Close() error   { return nil }

func newCheckpointReader(dir string) (WALReader, io.Closer, error) {
	lastCheckpointDir, idx, err := lastCheckpoint(dir)
	if err != nil {
		return nil, nil, err
	}
	if idx < 0 {
		level.Info(util_log.Logger).Log("msg", "no checkpoint found, treating as no-op")
		var reader NoopWALReader
		return reader, reader, nil
	}

	r, err := wal.NewSegmentsReader(lastCheckpointDir)
	if err != nil {
		return nil, nil, err
	}
	return wal.NewReader(r), r, nil
}

type Recoverer interface {
	NumWorkers() int
	Series(series *Series) error
	SetStream(userID string, series record.RefSeries) error
	Push(userID string, entries RefEntries) error
	Done() <-chan struct{}
}

type ingesterRecoverer struct {
	// basically map[userID]map[fingerprint]*stream
	users sync.Map
	ing   *Ingester

	done chan struct{}
}

func newIngesterRecoverer(i *Ingester) *ingesterRecoverer {
	return &ingesterRecoverer{
		ing:  i,
		done: make(chan struct{}),
	}
}

// Use all available cores
func (r *ingesterRecoverer) NumWorkers() int { return runtime.GOMAXPROCS(0) }

func (r *ingesterRecoverer) Series(series *Series) error {
	return r.ing.replayController.WithBackPressure(func() error {
		inst, err := r.ing.GetOrCreateInstance(series.UserID)
		if err != nil {
			return err
		}

		// TODO(owen-d): create another fn to avoid unnecessary label type conversions.
		stream, err := inst.getOrCreateStream(logproto.Stream{
			Labels: logproto.FromLabelAdaptersToLabels(series.Labels).String(),
		}, nil)
		if err != nil {
			return err
		}

		bytesAdded, entriesAdded, err := stream.setChunks(series.Chunks)
		stream.lastLine.ts = series.To
		stream.lastLine.content = series.LastLine
		stream.entryCt = series.EntryCt
		stream.highestTs = series.HighestTs

		if err != nil {
			return err
		}
		r.ing.metrics.memoryChunks.Add(float64(len(series.Chunks)))
		r.ing.metrics.recoveredChunksTotal.Add(float64(len(series.Chunks)))
		r.ing.metrics.recoveredEntriesTotal.Add(float64(entriesAdded))
		r.ing.replayController.Add(int64(bytesAdded))

		// now store the stream in the recovery map under the fingerprint originally recorded
		// as it's possible the newly mapped fingerprint is different. This is because the WAL records
		// will use this original reference.
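		// The key is stored as a chunks.HeadSeriesRef so that it matches the Ref type
		// Push later uses to look the stream back up from WAL records.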
		got, _ := r.users.LoadOrStore(series.UserID, &sync.Map{})
		streamsMap := got.(*sync.Map)
		streamsMap.Store(chunks.HeadSeriesRef(series.Fingerprint), stream)

		return nil
	})
}

// SetStream is responsible for setting the key path for userIDs -> fingerprints -> streams.
// Internally, this uses nested sync.Maps due to their performance benefits for sets that only grow.
// Using these also allows us to bypass the ingester -> instance -> stream hierarchy internally, which
// may yield some performance gains, but is essential for the following:
// Due to the use of the instance's fingerprint mapper, stream fingerprints are NOT necessarily
// deterministic. The WAL uses the post-mapped fingerprint on the ingester that originally
// created the stream, and we ensure that said fingerprint maps correctly to the newly
// created stream during WAL replay, even if the new in-memory stream was assigned a different
// fingerprint from the mapper. This is paramount because subsequent WAL records will use
// the fingerprint reported in the WAL record, not the potentially differing one assigned during
// stream creation.
func (r *ingesterRecoverer) SetStream(userID string, series record.RefSeries) error {
	inst, err := r.ing.GetOrCreateInstance(userID)
	if err != nil {
		return err
	}

	stream, err := inst.getOrCreateStream(
		logproto.Stream{
			Labels: series.Labels.String(),
		},
		nil,
	)
	if err != nil {
		return err
	}

	// Now that we have the stream, ensure that the userID -> fingerprint -> stream
	// path is set properly.
	got, _ := r.users.LoadOrStore(userID, &sync.Map{})
	streamsMap := got.(*sync.Map)
	streamsMap.Store(series.Ref, stream)
	return nil
}

func (r *ingesterRecoverer) Push(userID string, entries RefEntries) error {
	return r.ing.replayController.WithBackPressure(func() error {
		out, ok := r.users.Load(userID)
		if !ok {
			return errors.Errorf("user (%s) not set during WAL replay", userID)
		}

		s, ok := out.(*sync.Map).Load(entries.Ref)
		if !ok {
			return errors.Errorf("stream (%d) not set during WAL replay for user (%s)", entries.Ref, userID)
		}

		// Ignore out-of-order errors here (it's possible for a checkpoint to already have data from the WAL segments).
		bytesAdded, err := s.(*stream).Push(context.Background(), entries.Entries, nil, entries.Counter, true)
		r.ing.replayController.Add(int64(bytesAdded))
		if err != nil && err == ErrEntriesExist {
			r.ing.metrics.duplicateEntriesTotal.Add(float64(len(entries.Entries)))
		}
		return nil
	})
}

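// Close marks recovery as finished, re-enables per-tenant limits, and normalizes the
// recovered streams: replay counters are reset, empty streams are removed, and head
// blocks are converted if unordered writes were allowed in the WAL but are now disabled.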
func (r *ingesterRecoverer) Close() {
	// Ensure this is only run once.
	select {
	case <-r.done:
		return
	default:
	}

	close(r.done)

	// Enable the limiter here to accurately reflect tenant limits after recovery.
	r.ing.limiter.Enable()

	for _, inst := range r.ing.getInstances() {
		inst.forAllStreams(context.Background(), func(s *stream) error {
			s.chunkMtx.Lock()
			defer s.chunkMtx.Unlock()

			// Reset all the incrementing stream counters after a successful WAL replay.
			s.resetCounter()

			if len(s.chunks) == 0 {
				inst.removeStream(s)
				return nil
			}

			// If we've replayed a WAL with unordered writes, but the new
			// configuration disables them, convert all streams/head blocks
			// to ensure unordered writes are disabled after the replay,
			// but without dropping any previously accepted data.
			isAllowed := r.ing.limiter.UnorderedWrites(s.tenant)
			old := s.unorderedWrites
			s.unorderedWrites = isAllowed

			if !isAllowed && old {
				err := s.chunks[len(s.chunks)-1].chunk.ConvertHead(headBlockType(isAllowed))
				if err != nil {
					level.Warn(util_log.Logger).Log(
						"msg", "error converting headblock",
						"err", err.Error(),
						"stream", s.labels.String(),
						"component", "ingesterRecoverer",
					)
				}
			}

			return nil
		})
	}
}

func (r *ingesterRecoverer) Done() <-chan struct{} {
	return r.done
}

func RecoverWAL(reader WALReader, recoverer Recoverer) error {
	dispatch := func(recoverer Recoverer, b []byte, inputs []chan recoveryInput) error {
		rec := recordPool.GetRecord()
		if err := decodeWALRecord(b, rec); err != nil {
			return err
		}

		// First process all series to ensure we don't write entries to nonexistent series.
		var firstErr error
		for _, s := range rec.Series {
			if err := recoverer.SetStream(rec.UserID, s); err != nil {
				if firstErr == nil {
					firstErr = err
				}
			}
		}

		for _, entries := range rec.RefEntries {
			worker := int(uint64(entries.Ref) % uint64(len(inputs)))
			inputs[worker] <- recoveryInput{
				userID: rec.UserID,
				data:   entries,
			}
		}

		return firstErr
	}

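	// process drains a single worker's input channel. Because dispatch shards entries
	// by series ref, all entries for a given stream are handled by one worker and are
	// therefore replayed in the order they appear in the WAL.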
	process := func(recoverer Recoverer, input <-chan recoveryInput, errCh chan<- error) {
		for {
			select {
			case <-recoverer.Done():

			case next, ok := <-input:
				if !ok {
					return
				}
				entries, ok := next.data.(RefEntries)
				var err error
				if !ok {
					err = errors.Errorf("unexpected type (%T) when recovering WAL, expecting (%T)", next.data, entries)
				}
				if err == nil {
					err = recoverer.Push(next.userID, entries)
				}

				// Pass the error back, but respect the quit signal.
				if err != nil {
					errCh <- err
				}
			}
		}
	}

	return recoverGeneric(
		reader,
		recoverer,
		dispatch,
		process,
	)
}

func RecoverCheckpoint(reader WALReader, recoverer Recoverer) error {
	dispatch := func(recoverer Recoverer, b []byte, inputs []chan recoveryInput) error {
		s := &Series{}
		if err := decodeCheckpointRecord(b, s); err != nil {
			return err
		}

		worker := int(s.Fingerprint % uint64(len(inputs)))
		inputs[worker] <- recoveryInput{
			userID: s.UserID,
			data:   s,
		}
		return nil
	}

	process := func(recoverer Recoverer, input <-chan recoveryInput, errCh chan<- error) {
		for {
			select {
			case <-recoverer.Done():

			case next, ok := <-input:
				if !ok {
					return
				}
				series, ok := next.data.(*Series)
				var err error
				if !ok {
					err = errors.Errorf("unexpected type (%T) when recovering WAL, expecting (%T)", next.data, series)
				}
				if err == nil {
					err = recoverer.Series(series)
				}

				if err != nil {
					errCh <- err
				}
			}
		}
	}

	return recoverGeneric(
		reader,
		recoverer,
		dispatch,
		process,
	)
}

type recoveryInput struct {
	userID string
	data   interface{}
}

// recoverGeneric enables reusing the ability to recover from WALs of different types
// by exposing the dispatch and process functions.
// Note: it explicitly does not call the Recoverer.Close function as it's possible to layer
// multiple recoveries on top of each other, as in the case of recovering from Checkpoints
// then the WAL.
func recoverGeneric(
	reader WALReader,
	recoverer Recoverer,
	dispatch func(Recoverer, []byte, []chan recoveryInput) error,
	process func(Recoverer, <-chan recoveryInput, chan<- error),
) error {
	var wg sync.WaitGroup
	var firstErr error
	nWorkers := recoverer.NumWorkers()

	if nWorkers < 1 {
		return errors.New("cannot recover with no workers")
	}

	errCh := make(chan error)
	inputs := make([]chan recoveryInput, 0, nWorkers)
	wg.Add(nWorkers)
	for i := 0; i < nWorkers; i++ {
		inputs = append(inputs, make(chan recoveryInput))

		go func(input <-chan recoveryInput) {
			defer wg.Done()
			process(recoverer, input, errCh)
		}(inputs[i])
	}

	go func() {
		for reader.Next() {
			b := reader.Record()
			if err := reader.Err(); err != nil {
				errCh <- err
				continue
			}

			if err := dispatch(recoverer, b, inputs); err != nil {
				errCh <- err
				continue
			}
		}

		for _, w := range inputs {
			close(w)
		}
	}()

	finished := make(chan struct{})
	go func(finished chan<- struct{}) {
		wg.Wait()
		finished <- struct{}{}
	}(finished)

	for {
		select {
		case <-finished:
			return firstErr
		case err := <-errCh:
			if firstErr == nil {
				firstErr = err
			}
		}
	}
}
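
// The sketch below is an editorial illustration, not part of the upstream file: it shows
// how the layering described in recoverGeneric's note could be wired together, assuming
// the WAL segments live directly under walDir and that this hypothetical recoverFromDisk
// helper runs before the ingester starts accepting writes. The real start-up path in the
// ingester performs additional segment bookkeeping that is elided here.
func recoverFromDisk(walDir string, ing *Ingester) error {
	recoverer := newIngesterRecoverer(ing)
	// Close is deferred until both passes have run, since it finalizes streams and
	// re-enables per-tenant limits.
	defer recoverer.Close()

	// Pass 1: replay the most recent checkpoint so every recorded series and its
	// chunks exist in memory.
	checkpointReader, checkpointCloser, err := newCheckpointReader(walDir)
	if err != nil {
		return err
	}
	defer checkpointCloser.Close()

	if err := RecoverCheckpoint(checkpointReader, recoverer); err != nil {
		return err
	}

	// Pass 2: layer the WAL segments on top of the checkpoint. Entries already present
	// in checkpointed chunks surface as ErrEntriesExist and are counted as duplicates.
	segments, err := wal.NewSegmentsReader(walDir)
	if err != nil {
		return err
	}
	defer segments.Close()

	return RecoverWAL(wal.NewReader(segments), recoverer)
}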