github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/clients/pkg/promtail/client/client.go

package client

import (
	"bufio"
	"bytes"
	"context"
	"crypto/sha256"
	"errors"
	"fmt"
	"io"
	"net/http"
	"strconv"
	"sync"
	"time"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/grafana/dskit/backoff"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/common/config"
	"github.com/prometheus/common/model"
	"github.com/prometheus/prometheus/promql/parser"

	"github.com/grafana/loki/clients/pkg/promtail/api"

	lokiutil "github.com/grafana/loki/pkg/util"
	"github.com/grafana/loki/pkg/util/build"
)

const (
	contentType  = "application/x-protobuf"
	maxErrMsgLen = 1024

	// Label reserved to override the tenant ID while processing
	// pipeline stages
	ReservedLabelTenantID = "__tenant_id__"

	LatencyLabel = "filename"
	HostLabel    = "host"
	ClientLabel  = "client"
)

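// UserAgent is the User-Agent header value promtail sends with every push request.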
var UserAgent = fmt.Sprintf("promtail/%s", build.Version)

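// Metrics holds the Prometheus metrics exported by promtail clients.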
type Metrics struct {
	encodedBytes     *prometheus.CounterVec
	sentBytes        *prometheus.CounterVec
	droppedBytes     *prometheus.CounterVec
	sentEntries      *prometheus.CounterVec
	droppedEntries   *prometheus.CounterVec
	requestDuration  *prometheus.HistogramVec
	batchRetries     *prometheus.CounterVec
	countersWithHost []*prometheus.CounterVec
	streamLag        *prometheus.GaugeVec
}

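// NewMetrics creates the client metrics and, when reg is non-nil, registers them,
// reusing any collectors that are already registered.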
func NewMetrics(reg prometheus.Registerer, streamLagLabels []string) *Metrics {
	var m Metrics

	m.encodedBytes = prometheus.NewCounterVec(prometheus.CounterOpts{
		Namespace: "promtail",
		Name:      "encoded_bytes_total",
		Help:      "Number of bytes encoded and ready to send.",
	}, []string{HostLabel})
	m.sentBytes = prometheus.NewCounterVec(prometheus.CounterOpts{
		Namespace: "promtail",
		Name:      "sent_bytes_total",
		Help:      "Number of bytes sent.",
	}, []string{HostLabel})
	m.droppedBytes = prometheus.NewCounterVec(prometheus.CounterOpts{
		Namespace: "promtail",
		Name:      "dropped_bytes_total",
		Help:      "Number of bytes dropped because they failed to be sent to the ingester after all retries.",
	}, []string{HostLabel})
	m.sentEntries = prometheus.NewCounterVec(prometheus.CounterOpts{
		Namespace: "promtail",
		Name:      "sent_entries_total",
		Help:      "Number of log entries sent to the ingester.",
	}, []string{HostLabel})
	m.droppedEntries = prometheus.NewCounterVec(prometheus.CounterOpts{
		Namespace: "promtail",
		Name:      "dropped_entries_total",
		Help:      "Number of log entries dropped because they failed to be sent to the ingester after all retries.",
	}, []string{HostLabel})
	m.requestDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Namespace: "promtail",
		Name:      "request_duration_seconds",
		Help:      "Duration of send requests.",
	}, []string{"status_code", HostLabel})
	m.batchRetries = prometheus.NewCounterVec(prometheus.CounterOpts{
		Namespace: "promtail",
		Name:      "batch_retries_total",
		Help:      "Number of times batches have had to be retried.",
	}, []string{HostLabel})

	m.countersWithHost = []*prometheus.CounterVec{
		m.encodedBytes, m.sentBytes, m.droppedBytes, m.sentEntries, m.droppedEntries,
	}

	streamLagLabelsMerged := []string{HostLabel, ClientLabel}
	streamLagLabelsMerged = append(streamLagLabelsMerged, streamLagLabels...)
	m.streamLag = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "promtail",
		Name:      "stream_lag_seconds",
		Help:      "Difference between current time and last batch timestamp for successful sends",
	}, streamLagLabelsMerged)

	if reg != nil {
		m.encodedBytes = mustRegisterOrGet(reg, m.encodedBytes).(*prometheus.CounterVec)
		m.sentBytes = mustRegisterOrGet(reg, m.sentBytes).(*prometheus.CounterVec)
		m.droppedBytes = mustRegisterOrGet(reg, m.droppedBytes).(*prometheus.CounterVec)
		m.sentEntries = mustRegisterOrGet(reg, m.sentEntries).(*prometheus.CounterVec)
		m.droppedEntries = mustRegisterOrGet(reg, m.droppedEntries).(*prometheus.CounterVec)
		m.requestDuration = mustRegisterOrGet(reg, m.requestDuration).(*prometheus.HistogramVec)
		m.batchRetries = mustRegisterOrGet(reg, m.batchRetries).(*prometheus.CounterVec)
		m.streamLag = mustRegisterOrGet(reg, m.streamLag).(*prometheus.GaugeVec)
	}

	return &m
}

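// mustRegisterOrGet registers c with reg; if a collector with the same descriptor is
// already registered it returns the existing one, and it panics on any other error.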
func mustRegisterOrGet(reg prometheus.Registerer, c prometheus.Collector) prometheus.Collector {
	if err := reg.Register(c); err != nil {
		if are, ok := err.(prometheus.AlreadyRegisteredError); ok {
			return are.ExistingCollector
		}
		panic(err)
	}
	return c
}

// Client pushes entries to Loki and can be stopped
type Client interface {
	api.EntryHandler
	// StopNow stops the goroutine that sends batches of entries, without retrying pending batches.
	StopNow()
	Name() string
}

// Client for pushing logs in snappy-compressed protos over HTTP.
type client struct {
	name            string
	metrics         *Metrics
	streamLagLabels []string
	logger          log.Logger
	cfg             Config
	client          *http.Client
	entries         chan api.Entry

	once sync.Once
	wg   sync.WaitGroup

	externalLabels model.LabelSet

	// ctx is used in any upstream calls from the `client`.
	ctx    context.Context
	cancel context.CancelFunc
}

// Tripperware can wrap a roundtripper.
type Tripperware func(http.RoundTripper) http.RoundTripper

// New makes a new Client.
func New(metrics *Metrics, cfg Config, streamLagLabels []string, logger log.Logger) (Client, error) {
	if cfg.StreamLagLabels.String() != "" {
		return nil, fmt.Errorf("client config stream_lag_labels is deprecated in favour of the config file options block field and must be removed from the client config: %+v", cfg.StreamLagLabels.String())
	}
	return newClient(metrics, cfg, streamLagLabels, logger)
}
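
// A minimal construction sketch (illustrative only; the prepared Config value,
// nil stream-lag labels and nop logger below are assumptions, not requirements
// of this package):
//
//	metrics := NewMetrics(prometheus.DefaultRegisterer, nil)
//	c, err := New(metrics, cfg, nil, log.NewNopLogger())
//	if err != nil {
//		// handle error
//	}
//	defer c.Stop()
//	// entries are then pushed on c.Chan() as api.Entry values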

func newClient(metrics *Metrics, cfg Config, streamLagLabels []string, logger log.Logger) (*client, error) {
	if cfg.URL.URL == nil {
		return nil, errors.New("client needs target URL")
	}

	ctx, cancel := context.WithCancel(context.Background())

	c := &client{
		logger:          log.With(logger, "component", "client", "host", cfg.URL.Host),
		cfg:             cfg,
		entries:         make(chan api.Entry),
		metrics:         metrics,
		streamLagLabels: streamLagLabels,
		name:            asSha256(cfg),

		externalLabels: cfg.ExternalLabels.LabelSet,
		ctx:            ctx,
		cancel:         cancel,
	}
	if cfg.Name != "" {
		c.name = cfg.Name
	}

	err := cfg.Client.Validate()
	if err != nil {
		return nil, err
	}

	c.client, err = config.NewClientFromConfig(cfg.Client, "promtail", config.WithHTTP2Disabled())
	if err != nil {
		return nil, err
	}

	c.client.Timeout = cfg.Timeout

	// Initialize counters to 0 so the metrics are exported before their
	// first increment, avoiding gaps in the exported series.
	for _, counter := range c.metrics.countersWithHost {
		counter.WithLabelValues(c.cfg.URL.Host).Add(0)
	}

	c.wg.Add(1)
	go c.run()
	return c, nil
}

// NewWithTripperware creates a new Loki client with a custom tripperware.
func NewWithTripperware(metrics *Metrics, cfg Config, streamLagLabels []string, logger log.Logger, tp Tripperware) (Client, error) {
	c, err := newClient(metrics, cfg, streamLagLabels, logger)
	if err != nil {
		return nil, err
	}

	if tp != nil {
		c.client.Transport = tp(c.client.Transport)
	}

	return c, nil
}
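
// A sketch of a Tripperware that adds a static header to every push request
// (illustrative only; the header name and wrapper type are arbitrary examples,
// not part of this package):
//
//	type headerRT struct{ next http.RoundTripper }
//
//	func (rt headerRT) RoundTrip(r *http.Request) (*http.Response, error) {
//		r.Header.Set("X-Example", "demo")
//		return rt.next.RoundTrip(r)
//	}
//
//	c, err := NewWithTripperware(metrics, cfg, nil, logger, func(next http.RoundTripper) http.RoundTripper {
//		return headerRT{next: next}
//	})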
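// run is the main send loop: it groups incoming entries into per-tenant batches and
// flushes a batch once adding an entry would exceed BatchSize or its age passes BatchWait.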
func (c *client) run() {
	batches := map[string]*batch{}

	// The client handles multiple batches (one per tenant) and each batch can be
	// created at a different point in time, so we check for batches whose max wait
	// time has been reached ten times per BatchWait period; this keeps the extra
	// delay in sending a batch below 10% of the max waiting time. The check interval
	// is floored at 10ms to avoid overly frequent checks when BatchWait is very low.
	minWaitCheckFrequency := 10 * time.Millisecond
	maxWaitCheckFrequency := c.cfg.BatchWait / 10
	if maxWaitCheckFrequency < minWaitCheckFrequency {
		maxWaitCheckFrequency = minWaitCheckFrequency
	}

	maxWaitCheck := time.NewTicker(maxWaitCheckFrequency)

	defer func() {
		maxWaitCheck.Stop()
		// Send all pending batches
		for tenantID, batch := range batches {
			c.sendBatch(tenantID, batch)
		}

		c.wg.Done()
	}()

	for {
		select {
		case e, ok := <-c.entries:
			if !ok {
				return
			}
			e, tenantID := c.processEntry(e)
			batch, ok := batches[tenantID]

			// If the batch doesn't exist yet, we create a new one with the entry
			if !ok {
				batches[tenantID] = newBatch(e)
				break
			}

			// If adding the entry to the batch would push it over the maximum allowed
			// size, we send the current batch first and then create a new one with the entry
			if batch.sizeBytesAfter(e) > c.cfg.BatchSize {
				c.sendBatch(tenantID, batch)

				batches[tenantID] = newBatch(e)
				break
			}

			// The max size of the batch isn't reached, so we can add the entry
			batch.add(e)

		case <-maxWaitCheck.C:
			// Send all batches whose max wait time has been reached
			for tenantID, batch := range batches {
				if batch.age() < c.cfg.BatchWait {
					continue
				}

				c.sendBatch(tenantID, batch)
				delete(batches, tenantID)
			}
		}
	}
}
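// Chan returns the channel on which entries are sent to this client.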
func (c *client) Chan() chan<- api.Entry {
	return c.entries
}

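// asSha256 returns the first six hex characters of the SHA-256 hash of the value's
// default string representation; it is used as a stable client name when none is configured.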
func asSha256(o interface{}) string {
	h := sha256.New()
	h.Write([]byte(fmt.Sprintf("%v", o)))

	temp := fmt.Sprintf("%x", h.Sum(nil))
	return temp[:6]
}

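// sendBatch encodes the batch and pushes it to Loki, retrying 429s, 5xx responses and
// connection-level errors according to the backoff config; on success it also updates
// the per-stream lag gauge.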
func (c *client) sendBatch(tenantID string, batch *batch) {
	buf, entriesCount, err := batch.encode()
	if err != nil {
		level.Error(c.logger).Log("msg", "error encoding batch", "error", err)
		return
	}
	bufBytes := float64(len(buf))
	c.metrics.encodedBytes.WithLabelValues(c.cfg.URL.Host).Add(bufBytes)

	backoff := backoff.New(c.ctx, c.cfg.BackoffConfig)
	var status int
	for {
		start := time.Now()
		// send uses `timeout` internally, so `context.Background` is good enough.
		status, err = c.send(context.Background(), tenantID, buf)

		c.metrics.requestDuration.WithLabelValues(strconv.Itoa(status), c.cfg.URL.Host).Observe(time.Since(start).Seconds())

		if err == nil {
			c.metrics.sentBytes.WithLabelValues(c.cfg.URL.Host).Add(bufBytes)
			c.metrics.sentEntries.WithLabelValues(c.cfg.URL.Host).Add(float64(entriesCount))
			for _, s := range batch.streams {
				lbls, err := parser.ParseMetric(s.Labels)
				if err != nil {
					// It is unclear whether this can happen in practice; if it does, give up on updating the lag metric.
					level.Warn(c.logger).Log("msg", "error converting stream label string to label.Labels, cannot update lagging metric", "error", err)
					return
				}
				lblSet := make(prometheus.Labels)
				for _, lbl := range c.streamLagLabels {
					// A label from streamLagLabels may not be present in the stream, but we still need an
					// empty value so that the prometheus client library doesn't panic on inconsistent label cardinality.
					value := ""
					for i := range lbls {
						if lbls[i].Name == lbl {
							value = lbls[i].Value
						}
					}
					lblSet[lbl] = value
				}
				// Always set the host.
				lblSet[HostLabel] = c.cfg.URL.Host
				// Also set the client name: with multiple promtail clients configured we would otherwise run
				// into a "duplicate metric collected with same labels" error when scraping the /metrics endpoint.
				lblSet[ClientLabel] = c.name
				c.metrics.streamLag.With(lblSet).Set(time.Since(s.Entries[len(s.Entries)-1].Timestamp).Seconds())
			}
			return
		}

		// Only retry 429s, 500s and connection-level errors.
		if status > 0 && status != 429 && status/100 != 5 {
			break
		}

		level.Warn(c.logger).Log("msg", "error sending batch, will retry", "status", status, "error", err)
		c.metrics.batchRetries.WithLabelValues(c.cfg.URL.Host).Inc()
		backoff.Wait()

		// Make sure it sends at least once before checking for retry.
		if !backoff.Ongoing() {
			break
		}
	}

	if err != nil {
		level.Error(c.logger).Log("msg", "final error sending batch", "status", status, "error", err)
		c.metrics.droppedBytes.WithLabelValues(c.cfg.URL.Host).Add(bufBytes)
		c.metrics.droppedEntries.WithLabelValues(c.cfg.URL.Host).Add(float64(entriesCount))
	}
}

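// send performs a single push request to Loki with the configured timeout and returns
// the HTTP status code (or -1 if the request could not be performed) together with any error.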
func (c *client) send(ctx context.Context, tenantID string, buf []byte) (int, error) {
	ctx, cancel := context.WithTimeout(ctx, c.cfg.Timeout)
	defer cancel()
	req, err := http.NewRequest("POST", c.cfg.URL.String(), bytes.NewReader(buf))
	if err != nil {
		return -1, err
	}
	req = req.WithContext(ctx)
	req.Header.Set("Content-Type", contentType)
	req.Header.Set("User-Agent", UserAgent)

	// If the tenant ID is not empty, promtail is running in multi-tenant mode, so
	// we should send it to Loki
	if tenantID != "" {
		req.Header.Set("X-Scope-OrgID", tenantID)
	}

	resp, err := c.client.Do(req)
	if err != nil {
		return -1, err
	}
	defer lokiutil.LogError("closing response body", resp.Body.Close)

	if resp.StatusCode/100 != 2 {
		scanner := bufio.NewScanner(io.LimitReader(resp.Body, maxErrMsgLen))
		line := ""
		if scanner.Scan() {
			line = scanner.Text()
		}
		err = fmt.Errorf("server returned HTTP status %s (%d): %s", resp.Status, resp.StatusCode, line)
	}
	return resp.StatusCode, err
}

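// getTenantID resolves the tenant ID for an entry: the reserved __tenant_id__ label
// takes precedence over the tenant ID from the client config.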
func (c *client) getTenantID(labels model.LabelSet) string {
	// Check if it has been overridden while processing the pipeline stages
	if value, ok := labels[ReservedLabelTenantID]; ok {
		return string(value)
	}

	// Check if it has been specified in the config
	if c.cfg.TenantID != "" {
		return c.cfg.TenantID
	}

	// Defaults to an empty string, which means the X-Scope-OrgID header
	// will not be sent
	return ""
}

// Stop the client.
func (c *client) Stop() {
	c.once.Do(func() { close(c.entries) })
	c.wg.Wait()
}

// StopNow stops the client without retries
func (c *client) StopNow() {
	// cancel will stop retrying http requests.
	c.cancel()
	c.Stop()
}

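// processEntry merges any configured external labels into the entry and resolves its tenant ID.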
func (c *client) processEntry(e api.Entry) (api.Entry, string) {
	if len(c.externalLabels) > 0 {
		e.Labels = c.externalLabels.Merge(e.Labels)
	}
	tenantID := c.getTenantID(e.Labels)
	return e, tenantID
}

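// UnregisterLatencyMetric removes the stream lag series matching the given labels;
// the host label is filled in from the client config.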
func (c *client) UnregisterLatencyMetric(labels prometheus.Labels) {
	labels[HostLabel] = c.cfg.URL.Host
	c.metrics.streamLag.Delete(labels)
}

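// Name returns the client name: the configured name if set, otherwise a short hash of the config.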
func (c *client) Name() string {
	return c.name
}