github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/pkg/querier/tail.go

package querier

import (
	"context"
	"fmt"
	"sync"
	"time"

	"github.com/go-kit/log/level"
	"github.com/pkg/errors"

	"github.com/grafana/loki/pkg/iter"
	loghttp "github.com/grafana/loki/pkg/loghttp/legacy"
	"github.com/grafana/loki/pkg/logproto"
	util_log "github.com/grafana/loki/pkg/util/log"
)

const (
	// how often to check connections with ingesters
	checkConnectionsWithIngestersPeriod = time.Second * 5

	// the size of the channel buffer used to send tailing streams
	// back to the requesting client
	maxBufferedTailResponses = 10

	// the maximum number of entries to return in a TailResponse
	maxEntriesPerTailResponse = 100

	// the maximum number of dropped entries to keep in memory that will be sent along
	// with the next successfully pushed response. Once the dropped entries memory buffer
	// exceeds this value, we start skipping dropped entries too.
	maxDroppedEntriesPerTailResponse = 1000
)

// Tailer manages the complete lifecycle of a tail request
type Tailer struct {
	// openStreamIterator is for streams already open
	openStreamIterator iter.HeapIterator
	streamMtx          sync.Mutex // for synchronizing access to openStreamIterator

	currEntry  logproto.Entry
	currLabels string

	// keep track of the streams for metrics about active streams
	seenStreams    map[uint64]struct{}
	seenStreamsMtx sync.Mutex

	tailDisconnectedIngesters func([]string) (map[string]logproto.Querier_TailClient, error)

	querierTailClients    map[string]logproto.Querier_TailClient // addr -> grpc clients for tailing logs from ingesters
	querierTailClientsMtx sync.RWMutex

	stopped         bool
	delayFor        time.Duration
	responseChan    chan *loghttp.TailResponse
	closeErrChan    chan error
	tailMaxDuration time.Duration

	// if we are not seeing any response from an ingester,
	// how long do we want to wait by going to sleep
	waitEntryThrottle time.Duration
	metrics           *Metrics
}

func (t *Tailer) readTailClients() {
	t.querierTailClientsMtx.RLock()
	defer t.querierTailClientsMtx.RUnlock()

	for addr, querierTailClient := range t.querierTailClients {
		go t.readTailClient(addr, querierTailClient)
	}
}
// loop keeps sending the oldest entry to responseChan. If the channel is blocked, the entry is dropped.
// When the channel is unblocked, details of the dropped entries are sent along with the current entry.
func (t *Tailer) loop() {
	checkConnectionTicker := time.NewTicker(checkConnectionsWithIngestersPeriod)
	defer checkConnectionTicker.Stop()

	tailMaxDurationTicker := time.NewTicker(t.tailMaxDuration)
	defer tailMaxDurationTicker.Stop()

	droppedEntries := make([]loghttp.DroppedEntry, 0)

	for !t.stopped {
		select {
		case <-checkConnectionTicker.C:
			// Try to reconnect dropped ingesters and connect to new ingesters
			if err := t.checkIngesterConnections(); err != nil {
				level.Error(util_log.Logger).Log("msg", "Error reconnecting to disconnected ingesters", "err", err)
			}
		case <-tailMaxDurationTicker.C:
			if err := t.close(); err != nil {
				level.Error(util_log.Logger).Log("msg", "Error closing Tailer", "err", err)
			}
			t.closeErrChan <- errors.New("reached tail max duration limit")
			return
		default:
		}

		// Read as many entries as we can (up to the max allowed) and populate the
		// tail response we'll send over the response channel
		var (
			tailResponse = new(loghttp.TailResponse)
			entriesCount = 0
			entriesSize  = 0
		)

		for ; entriesCount < maxEntriesPerTailResponse && t.next(); entriesCount++ {
			// If the response channel is blocked, we drop the current entry directly
			// to save the effort
			if t.isResponseChanBlocked() {
				droppedEntries = dropEntry(droppedEntries, t.currEntry.Timestamp, t.currLabels)
				continue
			}

			entriesSize += len(t.currEntry.Line)
			tailResponse.Streams = append(tailResponse.Streams, logproto.Stream{
				Labels:  t.currLabels,
				Entries: []logproto.Entry{t.currEntry},
			})
		}

		// If all consumed entries have been dropped because the response channel is blocked,
		// we should iterate the loop again
		if len(tailResponse.Streams) == 0 && entriesCount > 0 {
			continue
		}

		// If no entry has been consumed, we should ensure it's not because all ingester
		// connections have dropped, and then throttle for a while
		if len(tailResponse.Streams) == 0 {
			t.querierTailClientsMtx.RLock()
			numClients := len(t.querierTailClients)
			t.querierTailClientsMtx.RUnlock()

			if numClients == 0 {
				// All the connections to ingesters are dropped, try reconnecting or return an error
				if err := t.checkIngesterConnections(); err != nil {
					level.Error(util_log.Logger).Log("msg", "Error reconnecting to ingesters", "err", err)
				} else {
					continue
				}
				if err := t.close(); err != nil {
					level.Error(util_log.Logger).Log("msg", "Error closing Tailer", "err", err)
				}
				t.closeErrChan <- errors.New("all ingesters closed the connection")
				return
			}

			time.Sleep(t.waitEntryThrottle)
			continue
		}

		// Send the tail response through the response channel without blocking.
		// Drop the entry if the response channel buffer is full.
		if len(droppedEntries) > 0 {
			tailResponse.DroppedEntries = droppedEntries
		}

		select {
		case t.responseChan <- tailResponse:
			t.metrics.tailedBytesTotal.Add(float64(entriesSize))
			if len(droppedEntries) > 0 {
				droppedEntries = make([]loghttp.DroppedEntry, 0)
			}
		default:
			droppedEntries = dropEntries(droppedEntries, tailResponse.Streams)
		}
	}
}
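// Editor's note (not part of the original file), a back-of-the-envelope on the drop behaviour above:
// the response channel buffers at most maxBufferedTailResponses = 10 responses, each carrying up to
// maxEntriesPerTailResponse = 100 entries, so roughly 10 * 100 = 1000 entries can be in flight for a
// stalled client. Once the channel is full, further entries are only recorded as DroppedEntry, up to
// maxDroppedEntriesPerTailResponse = 1000; anything beyond that is skipped entirely.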
// checkIngesterConnections checks whether we are connected to all the ingesters needed to tail the logs.
// It reconnects to disconnected ingesters and connects to newly discovered ones.
func (t *Tailer) checkIngesterConnections() error {
	t.querierTailClientsMtx.Lock()
	defer t.querierTailClientsMtx.Unlock()

	connectedIngestersAddr := make([]string, 0, len(t.querierTailClients))
	for addr := range t.querierTailClients {
		connectedIngestersAddr = append(connectedIngestersAddr, addr)
	}

	newConnections, err := t.tailDisconnectedIngesters(connectedIngestersAddr)
	if err != nil {
		return fmt.Errorf("failed to connect with one or more ingester(s) during tailing: %w", err)
	}

	if len(newConnections) != 0 {
		for addr, tailClient := range newConnections {
			t.querierTailClients[addr] = tailClient
			go t.readTailClient(addr, tailClient)
		}
	}
	return nil
}

// dropTailClient removes a disconnected tail client from the map
func (t *Tailer) dropTailClient(addr string) {
	t.querierTailClientsMtx.Lock()
	defer t.querierTailClientsMtx.Unlock()

	delete(t.querierTailClients, addr)
}

// readTailClient keeps reading streams from the grpc connection with an ingester
func (t *Tailer) readTailClient(addr string, querierTailClient logproto.Querier_TailClient) {
	var resp *logproto.TailResponse
	var err error
	defer t.dropTailClient(addr)

	logger := util_log.WithContext(querierTailClient.Context(), util_log.Logger)
	for {
		if t.stopped {
			if err := querierTailClient.CloseSend(); err != nil {
				level.Error(logger).Log("msg", "Error closing grpc tail client", "err", err)
			}
			break
		}
		resp, err = querierTailClient.Recv()
		if err != nil {
			// We don't want to log an error when it's due to stopping the tail request
			if !t.stopped {
				level.Error(logger).Log("msg", "Error receiving response from grpc tail client", "err", err)
			}
			break
		}
		t.pushTailResponseFromIngester(resp)
	}
}

// pushTailResponseFromIngester pushes new streams from ingesters synchronously
func (t *Tailer) pushTailResponseFromIngester(resp *logproto.TailResponse) {
	t.streamMtx.Lock()
	defer t.streamMtx.Unlock()

	t.openStreamIterator.Push(iter.NewStreamIterator(*resp.Stream))
}

// next finds the oldest entry by peeking at the open stream iterator.
// Responses from ingesters are pushed onto the open stream iterator for further processing.
func (t *Tailer) next() bool {
	t.streamMtx.Lock()
	defer t.streamMtx.Unlock()

	if t.openStreamIterator.Len() == 0 || !time.Now().After(t.openStreamIterator.Peek().Add(t.delayFor)) || !t.openStreamIterator.Next() {
		return false
	}

	t.currEntry = t.openStreamIterator.Entry()
	t.currLabels = t.openStreamIterator.Labels()
	t.recordStream(t.openStreamIterator.StreamHash())

	return true
}
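// Editor's note (not part of the original file): next only releases an entry once it is at least
// delayFor old, i.e. when time.Now().After(peekedTimestamp.Add(t.delayFor)) holds. With
// delayFor = 2s, an entry stamped 10:00:00.000 is held back until the wall clock passes
// 10:00:02.000, which gives slightly lagging ingesters a chance to contribute older entries first.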
func (t *Tailer) close() error {
	t.streamMtx.Lock()
	defer t.streamMtx.Unlock()

	t.metrics.tailsActive.Dec()
	t.metrics.tailedStreamsActive.Sub(t.activeStreamCount())

	t.stopped = true
	return t.openStreamIterator.Close()
}

func (t *Tailer) isResponseChanBlocked() bool {
	// Thread-safety: len() and cap() on a channel are thread-safe. The cap() doesn't
	// change over time, while len() does.
	return len(t.responseChan) == cap(t.responseChan)
}

func (t *Tailer) getResponseChan() <-chan *loghttp.TailResponse {
	return t.responseChan
}

func (t *Tailer) getCloseErrorChan() <-chan error {
	return t.closeErrChan
}

func (t *Tailer) recordStream(id uint64) {
	t.seenStreamsMtx.Lock()
	defer t.seenStreamsMtx.Unlock()

	if _, ok := t.seenStreams[id]; ok {
		return
	}

	t.seenStreams[id] = struct{}{}
	t.metrics.tailedStreamsActive.Inc()
}

func (t *Tailer) activeStreamCount() float64 {
	t.seenStreamsMtx.Lock()
	defer t.seenStreamsMtx.Unlock()

	return float64(len(t.seenStreams))
}

func newTailer(
	delayFor time.Duration,
	querierTailClients map[string]logproto.Querier_TailClient,
	historicEntries iter.EntryIterator,
	tailDisconnectedIngesters func([]string) (map[string]logproto.Querier_TailClient, error),
	tailMaxDuration time.Duration,
	waitEntryThrottle time.Duration,
	m *Metrics,
) *Tailer {
	t := Tailer{
		openStreamIterator:        iter.NewMergeEntryIterator(context.Background(), []iter.EntryIterator{historicEntries}, logproto.FORWARD),
		querierTailClients:        querierTailClients,
		delayFor:                  delayFor,
		responseChan:              make(chan *loghttp.TailResponse, maxBufferedTailResponses),
		closeErrChan:              make(chan error),
		seenStreams:               make(map[uint64]struct{}),
		tailDisconnectedIngesters: tailDisconnectedIngesters,
		tailMaxDuration:           tailMaxDuration,
		waitEntryThrottle:         waitEntryThrottle,
		metrics:                   m,
	}

	t.metrics.tailsActive.Inc()
	t.readTailClients()
	go t.loop()
	return &t
}

func dropEntry(droppedEntries []loghttp.DroppedEntry, timestamp time.Time, labels string) []loghttp.DroppedEntry {
	if len(droppedEntries) >= maxDroppedEntriesPerTailResponse {
		return droppedEntries
	}

	return append(droppedEntries, loghttp.DroppedEntry{Timestamp: timestamp, Labels: labels})
}

func dropEntries(droppedEntries []loghttp.DroppedEntry, streams []logproto.Stream) []loghttp.DroppedEntry {
	for _, stream := range streams {
		for _, entry := range stream.Entries {
			// record the stream's labels (not the log line) for each dropped entry
			droppedEntries = dropEntry(droppedEntries, entry.Timestamp, stream.Labels)
		}
	}

	return droppedEntries
}
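// Editor's note, an illustrative usage sketch (not part of the original file): a caller owning a
// Tailer typically drains both channels until the tail ends. The names reconnectFn, throttle and
// conn are hypothetical stand-ins for whatever reconnect callback, throttle duration and client
// connection the caller actually uses; conn.WriteJSON is assumed to write one frame to the client.
//
//	tailer := newTailer(delayFor, tailClients, historicEntries, reconnectFn, tailMaxDuration, throttle, metrics)
//	for {
//		select {
//		case resp := <-tailer.getResponseChan():
//			if err := conn.WriteJSON(resp); err != nil {
//				// the client went away; stop tailing and release resources
//				return tailer.close()
//			}
//		case err := <-tailer.getCloseErrorChan():
//			// the Tailer closed itself (max duration reached or all ingesters disconnected)
//			return err
//		}
//	}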