github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/clients/pkg/promtail/client/client.go

package client

import (
	"bufio"
	"bytes"
	"context"
	"crypto/sha256"
	"errors"
	"fmt"
	"io"
	"net/http"
	"strconv"
	"sync"
	"time"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/grafana/dskit/backoff"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/common/config"
	"github.com/prometheus/common/model"
	"github.com/prometheus/prometheus/promql/parser"

	"github.com/grafana/loki/clients/pkg/promtail/api"

	lokiutil "github.com/grafana/loki/pkg/util"
	"github.com/grafana/loki/pkg/util/build"
)

const (
	contentType  = "application/x-protobuf"
	maxErrMsgLen = 1024

	// Label reserved to override the tenant ID while processing
	// pipeline stages
	ReservedLabelTenantID = "__tenant_id__"

	LatencyLabel = "filename"
	HostLabel    = "host"
	ClientLabel  = "client"
)

var UserAgent = fmt.Sprintf("promtail/%s", build.Version)

type Metrics struct {
	encodedBytes     *prometheus.CounterVec
	sentBytes        *prometheus.CounterVec
	droppedBytes     *prometheus.CounterVec
	sentEntries      *prometheus.CounterVec
	droppedEntries   *prometheus.CounterVec
	requestDuration  *prometheus.HistogramVec
	batchRetries     *prometheus.CounterVec
	countersWithHost []*prometheus.CounterVec
	streamLag        *prometheus.GaugeVec
}

func NewMetrics(reg prometheus.Registerer, streamLagLabels []string) *Metrics {
	var m Metrics

	m.encodedBytes = prometheus.NewCounterVec(prometheus.CounterOpts{
		Namespace: "promtail",
		Name:      "encoded_bytes_total",
		Help:      "Number of bytes encoded and ready to send.",
	}, []string{HostLabel})
	m.sentBytes = prometheus.NewCounterVec(prometheus.CounterOpts{
		Namespace: "promtail",
		Name:      "sent_bytes_total",
		Help:      "Number of bytes sent.",
	}, []string{HostLabel})
	m.droppedBytes = prometheus.NewCounterVec(prometheus.CounterOpts{
		Namespace: "promtail",
		Name:      "dropped_bytes_total",
		Help:      "Number of bytes dropped because failed to be sent to the ingester after all retries.",
	}, []string{HostLabel})
	m.sentEntries = prometheus.NewCounterVec(prometheus.CounterOpts{
		Namespace: "promtail",
		Name:      "sent_entries_total",
		Help:      "Number of log entries sent to the ingester.",
	}, []string{HostLabel})
	m.droppedEntries = prometheus.NewCounterVec(prometheus.CounterOpts{
		Namespace: "promtail",
		Name:      "dropped_entries_total",
		Help:      "Number of log entries dropped because failed to be sent to the ingester after all retries.",
	}, []string{HostLabel})
	m.requestDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Namespace: "promtail",
		Name:      "request_duration_seconds",
		Help:      "Duration of send requests.",
	}, []string{"status_code", HostLabel})
	m.batchRetries = prometheus.NewCounterVec(prometheus.CounterOpts{
		Namespace: "promtail",
		Name:      "batch_retries_total",
		Help:      "Number of times batches has had to be retried.",
	}, []string{HostLabel})

	m.countersWithHost = []*prometheus.CounterVec{
		m.encodedBytes, m.sentBytes, m.droppedBytes, m.sentEntries, m.droppedEntries,
	}

	streamLagLabelsMerged := []string{HostLabel, ClientLabel}
	streamLagLabelsMerged = append(streamLagLabelsMerged, streamLagLabels...)
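	// The stream lag gauge always carries the host and client labels, plus any
	// extra labels configured via the deprecated stream_lag_labels client
	// option or the config file options block.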
	m.streamLag = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "promtail",
		Name:      "stream_lag_seconds",
		Help:      "Difference between current time and last batch timestamp for successful sends",
	}, streamLagLabelsMerged)

	if reg != nil {
		m.encodedBytes = mustRegisterOrGet(reg, m.encodedBytes).(*prometheus.CounterVec)
		m.sentBytes = mustRegisterOrGet(reg, m.sentBytes).(*prometheus.CounterVec)
		m.droppedBytes = mustRegisterOrGet(reg, m.droppedBytes).(*prometheus.CounterVec)
		m.sentEntries = mustRegisterOrGet(reg, m.sentEntries).(*prometheus.CounterVec)
		m.droppedEntries = mustRegisterOrGet(reg, m.droppedEntries).(*prometheus.CounterVec)
		m.requestDuration = mustRegisterOrGet(reg, m.requestDuration).(*prometheus.HistogramVec)
		m.batchRetries = mustRegisterOrGet(reg, m.batchRetries).(*prometheus.CounterVec)
		m.streamLag = mustRegisterOrGet(reg, m.streamLag).(*prometheus.GaugeVec)
	}

	return &m
}

// mustRegisterOrGet registers the collector with the registerer, or returns
// the collector that is already registered under the same descriptor. It
// panics on any other registration error.
func mustRegisterOrGet(reg prometheus.Registerer, c prometheus.Collector) prometheus.Collector {
	if err := reg.Register(c); err != nil {
		if are, ok := err.(prometheus.AlreadyRegisteredError); ok {
			return are.ExistingCollector
		}
		panic(err)
	}
	return c
}

// Client pushes entries to Loki and can be stopped
type Client interface {
	api.EntryHandler
	// StopNow stops the goroutine that sends batches of entries, without waiting for retries.
	StopNow()
	Name() string
}

// Client for pushing logs in snappy-compressed protos over HTTP.
type client struct {
	name            string
	metrics         *Metrics
	streamLagLabels []string
	logger          log.Logger
	cfg             Config
	client          *http.Client
	entries         chan api.Entry

	once sync.Once
	wg   sync.WaitGroup

	externalLabels model.LabelSet

	// ctx is used in any upstream calls from the `client`.
	ctx    context.Context
	cancel context.CancelFunc
}

// Tripperware can wrap a roundtripper.
type Tripperware func(http.RoundTripper) http.RoundTripper

// New makes a new Client.
func New(metrics *Metrics, cfg Config, streamLagLabels []string, logger log.Logger) (Client, error) {
	if cfg.StreamLagLabels.String() != "" {
		return nil, fmt.Errorf("client config stream_lag_labels is deprecated in favour of the config file options block field, and will be ignored: %+v", cfg.StreamLagLabels.String())
	}
	return newClient(metrics, cfg, streamLagLabels, logger)
}

func newClient(metrics *Metrics, cfg Config, streamLagLabels []string, logger log.Logger) (*client, error) {
	if cfg.URL.URL == nil {
		return nil, errors.New("client needs target URL")
	}

	ctx, cancel := context.WithCancel(context.Background())

	c := &client{
		logger:          log.With(logger, "component", "client", "host", cfg.URL.Host),
		cfg:             cfg,
		entries:         make(chan api.Entry),
		metrics:         metrics,
		streamLagLabels: streamLagLabels,
		name:            asSha256(cfg),

		externalLabels: cfg.ExternalLabels.LabelSet,
		ctx:            ctx,
		cancel:         cancel,
	}
	if cfg.Name != "" {
		c.name = cfg.Name
	}

	err := cfg.Client.Validate()
	if err != nil {
		return nil, err
	}

	c.client, err = config.NewClientFromConfig(cfg.Client, "promtail", config.WithHTTP2Disabled())
	if err != nil {
		return nil, err
	}

	c.client.Timeout = cfg.Timeout

	// Initialize the per-host counters to 0 so their series are exported
	// before the first increment and no samples are missing.
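	// Note: only the counters listed in countersWithHost are pre-initialized
	// here; requestDuration, batchRetries and streamLag get their label sets
	// created lazily on first observation.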
	for _, counter := range c.metrics.countersWithHost {
		counter.WithLabelValues(c.cfg.URL.Host).Add(0)
	}

	c.wg.Add(1)
	go c.run()
	return c, nil
}

// NewWithTripperware creates a new Loki client with a custom tripperware.
func NewWithTripperware(metrics *Metrics, cfg Config, streamLagLabels []string, logger log.Logger, tp Tripperware) (Client, error) {
	c, err := newClient(metrics, cfg, streamLagLabels, logger)
	if err != nil {
		return nil, err
	}

	if tp != nil {
		c.client.Transport = tp(c.client.Transport)
	}

	return c, nil
}

func (c *client) run() {
	batches := map[string]*batch{}

	// The client handles multiple batches (one per tenant) and each batch can
	// be created at a different point in time, so we check for batches whose
	// max wait time has been reached 10 times per BatchWait; the maximum extra
	// delay before a batch is sent is therefore 10% of the max waiting time.
	// The check interval is floored at 10ms to avoid overly frequent checks
	// when BatchWait is very low.
	minWaitCheckFrequency := 10 * time.Millisecond
	maxWaitCheckFrequency := c.cfg.BatchWait / 10
	if maxWaitCheckFrequency < minWaitCheckFrequency {
		maxWaitCheckFrequency = minWaitCheckFrequency
	}

	maxWaitCheck := time.NewTicker(maxWaitCheckFrequency)

	defer func() {
		maxWaitCheck.Stop()
		// Send all pending batches
		for tenantID, batch := range batches {
			c.sendBatch(tenantID, batch)
		}

		c.wg.Done()
	}()

	for {
		select {
		case e, ok := <-c.entries:
			if !ok {
				return
			}
			e, tenantID := c.processEntry(e)
			batch, ok := batches[tenantID]

			// If the batch doesn't exist yet, we create a new one with the entry
			if !ok {
				batches[tenantID] = newBatch(e)
				break
			}

			// If adding the entry would push the batch size over the maximum
			// allowed, send the current batch and start a new one with the entry
			if batch.sizeBytesAfter(e) > c.cfg.BatchSize {
				c.sendBatch(tenantID, batch)

				batches[tenantID] = newBatch(e)
				break
			}

			// The max size of the batch isn't reached, so we can add the entry
			batch.add(e)

		case <-maxWaitCheck.C:
			// Send all batches whose max wait time has been reached
			for tenantID, batch := range batches {
				if batch.age() < c.cfg.BatchWait {
					continue
				}

				c.sendBatch(tenantID, batch)
				delete(batches, tenantID)
			}
		}
	}
}

func (c *client) Chan() chan<- api.Entry {
	return c.entries
}

// asSha256 derives a short, stable identifier from the client configuration;
// it is used as the default client name when none is configured.
func asSha256(o interface{}) string {
	h := sha256.New()
	h.Write([]byte(fmt.Sprintf("%v", o)))

	temp := fmt.Sprintf("%x", h.Sum(nil))
	return temp[:6]
}

func (c *client) sendBatch(tenantID string, batch *batch) {
	buf, entriesCount, err := batch.encode()
	if err != nil {
		level.Error(c.logger).Log("msg", "error encoding batch", "error", err)
		return
	}
	bufBytes := float64(len(buf))
	c.metrics.encodedBytes.WithLabelValues(c.cfg.URL.Host).Add(bufBytes)

	backoff := backoff.New(c.ctx, c.cfg.BackoffConfig)
	var status int
	for {
		start := time.Now()
		// send uses `timeout` internally, so `context.Background` is good enough.
		status, err = c.send(context.Background(), tenantID, buf)

		c.metrics.requestDuration.WithLabelValues(strconv.Itoa(status), c.cfg.URL.Host).Observe(time.Since(start).Seconds())

		if err == nil {
			c.metrics.sentBytes.WithLabelValues(c.cfg.URL.Host).Add(bufBytes)
			c.metrics.sentEntries.WithLabelValues(c.cfg.URL.Host).Add(float64(entriesCount))
			for _, s := range batch.streams {
				lbls, err := parser.ParseMetric(s.Labels)
				if err != nil {
					// is this possible?
					level.Warn(c.logger).Log("msg", "error converting stream label string to label.Labels, cannot update lagging metric", "error", err)
					return
				}
				lblSet := make(prometheus.Labels)
				for _, lbl := range c.streamLagLabels {
					// A label from streamLagLabels may not be present on the stream, but we
					// still need an empty value so that the prometheus client library doesn't
					// panic on inconsistent label cardinality.
					value := ""
					for i := range lbls {
						if lbls[i].Name == lbl {
							value = lbls[i].Value
						}
					}
					lblSet[lbl] = value
				}
				if lblSet != nil {
					// always set host
					lblSet[HostLabel] = c.cfg.URL.Host
					// Also set the client name: with multiple promtail clients configured we
					// would otherwise run into a "duplicate metric collected with the same
					// labels" error when scraping the /metrics endpoint.
					lblSet[ClientLabel] = c.name
					c.metrics.streamLag.With(lblSet).Set(time.Since(s.Entries[len(s.Entries)-1].Timestamp).Seconds())
				}
			}
			return
		}

		// Only retry 429s, 500s and connection-level errors.
		if status > 0 && status != 429 && status/100 != 5 {
			break
		}

		level.Warn(c.logger).Log("msg", "error sending batch, will retry", "status", status, "error", err)
		c.metrics.batchRetries.WithLabelValues(c.cfg.URL.Host).Inc()
		backoff.Wait()

		// Make sure it sends at least once before checking for retry.
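		// The backoff is bound to c.ctx, so cancelling the client (StopNow)
		// ends this retry loop as well, in addition to the MaxRetries limit
		// from the backoff config.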
		if !backoff.Ongoing() {
			break
		}
	}

	if err != nil {
		level.Error(c.logger).Log("msg", "final error sending batch", "status", status, "error", err)
		c.metrics.droppedBytes.WithLabelValues(c.cfg.URL.Host).Add(bufBytes)
		c.metrics.droppedEntries.WithLabelValues(c.cfg.URL.Host).Add(float64(entriesCount))
	}
}

func (c *client) send(ctx context.Context, tenantID string, buf []byte) (int, error) {
	ctx, cancel := context.WithTimeout(ctx, c.cfg.Timeout)
	defer cancel()
	req, err := http.NewRequest("POST", c.cfg.URL.String(), bytes.NewReader(buf))
	if err != nil {
		return -1, err
	}
	req = req.WithContext(ctx)
	req.Header.Set("Content-Type", contentType)
	req.Header.Set("User-Agent", UserAgent)

	// If the tenant ID is not empty, promtail is running in multi-tenant mode,
	// so we should send it to Loki
	if tenantID != "" {
		req.Header.Set("X-Scope-OrgID", tenantID)
	}

	resp, err := c.client.Do(req)
	if err != nil {
		return -1, err
	}
	defer lokiutil.LogError("closing response body", resp.Body.Close)

	if resp.StatusCode/100 != 2 {
		scanner := bufio.NewScanner(io.LimitReader(resp.Body, maxErrMsgLen))
		line := ""
		if scanner.Scan() {
			line = scanner.Text()
		}
		err = fmt.Errorf("server returned HTTP status %s (%d): %s", resp.Status, resp.StatusCode, line)
	}
	return resp.StatusCode, err
}

func (c *client) getTenantID(labels model.LabelSet) string {
	// Check if it has been overridden while processing the pipeline stages
	if value, ok := labels[ReservedLabelTenantID]; ok {
		return string(value)
	}

	// Check if it has been specified in the config
	if c.cfg.TenantID != "" {
		return c.cfg.TenantID
	}

	// Defaults to an empty string, which means the X-Scope-OrgID header
	// will not be sent
	return ""
}

// Stop the client.
func (c *client) Stop() {
	c.once.Do(func() { close(c.entries) })
	c.wg.Wait()
}

// StopNow stops the client without retries
func (c *client) StopNow() {
	// cancel will stop retrying http requests.
	c.cancel()
	c.Stop()
}

func (c *client) processEntry(e api.Entry) (api.Entry, string) {
	if len(c.externalLabels) > 0 {
		e.Labels = c.externalLabels.Merge(e.Labels)
	}
	tenantID := c.getTenantID(e.Labels)
	return e, tenantID
}

func (c *client) UnregisterLatencyMetric(labels prometheus.Labels) {
	labels[HostLabel] = c.cfg.URL.Host
	c.metrics.streamLag.Delete(labels)
}

func (c *client) Name() string {
	return c.name
}
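
// newExampleClient is a minimal usage sketch, not part of the upstream file:
// it shows how NewMetrics, the stream-lag labels and New fit together. The
// label list below and the fully populated Config (target URL, batch sizes,
// backoff, etc.) are assumptions and would normally come from the promtail
// configuration file.
func newExampleClient(cfg Config, logger log.Logger) (Client, error) {
	// Extra labels to attach to the promtail_stream_lag_seconds gauge, in
	// addition to the host and client labels set by the client itself.
	streamLagLabels := []string{"filename"}

	// Register the client metrics against the default registerer; collectors
	// that are already registered are reused via mustRegisterOrGet.
	metrics := NewMetrics(prometheus.DefaultRegisterer, streamLagLabels)

	// New validates the config (it must carry a target URL) and starts the
	// background goroutine that batches entries per tenant and ships them.
	return New(metrics, cfg, streamLagLabels, logger)
}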