github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/pkg/canary/reader/reader.go (about)

     1  package reader
     2  
     3  import (
     4  	"context"
     5  	"crypto/tls"
     6  	"encoding/base64"
     7  	"fmt"
     8  	"io"
     9  	"io/ioutil"
    10  	"log"
    11  	"net"
    12  	"net/http"
    13  	"net/url"
    14  	"strconv"
    15  	"strings"
    16  	"sync"
    17  	"time"
    18  
    19  	"github.com/gorilla/websocket"
    20  	"github.com/grafana/dskit/backoff"
    21  	json "github.com/json-iterator/go"
    22  	"github.com/pkg/errors"
    23  	"github.com/prometheus/client_golang/prometheus"
    24  	"github.com/prometheus/client_golang/prometheus/promauto"
    25  	"github.com/prometheus/common/config"
    26  
    27  	"github.com/grafana/loki/pkg/loghttp"
    28  	"github.com/grafana/loki/pkg/logqlmodel"
    29  	"github.com/grafana/loki/pkg/util/build"
    30  	"github.com/grafana/loki/pkg/util/unmarshal"
    31  )
    32  
    33  var (
    34  	reconnects = promauto.NewCounter(prometheus.CounterOpts{
    35  		Namespace: "loki_canary",
    36  		Name:      "ws_reconnects_total",
    37  		Help:      "counts every time the websocket connection has to reconnect",
    38  	})
    39  	websocketPings = promauto.NewCounter(prometheus.CounterOpts{
    40  		Namespace: "loki_canary",
    41  		Name:      "ws_pings_total",
    42  		Help:      "counts every time the websocket receives a ping message",
    43  	})
    44  	userAgent = fmt.Sprintf("loki-canary/%s", build.Version)
    45  )
    46  
    47  type LokiReader interface {
    48  	Query(start time.Time, end time.Time) ([]time.Time, error)
    49  	QueryCountOverTime(queryRange string) (float64, error)
    50  }
    51  
    52  type Reader struct {
    53  	header          http.Header
    54  	useTLS          bool
    55  	clientTLSConfig *tls.Config
    56  	caFile          string
    57  	addr            string
    58  	user            string
    59  	pass            string
    60  	tenantID        string
    61  	httpClient      *http.Client
    62  	queryTimeout    time.Duration
    63  	sName           string
    64  	sValue          string
    65  	lName           string
    66  	lVal            string
    67  	backoff         *backoff.Backoff
    68  	nextQuery       time.Time
    69  	backoffMtx      sync.RWMutex
    70  	interval        time.Duration
    71  	conn            *websocket.Conn
    72  	w               io.Writer
    73  	recv            chan time.Time
    74  	quit            chan struct{}
    75  	shuttingDown    bool
    76  	done            chan struct{}
    77  }
    78  
    79  func NewReader(writer io.Writer,
    80  	receivedChan chan time.Time,
    81  	useTLS bool,
    82  	tlsConfig *tls.Config,
    83  	caFile string,
    84  	address string,
    85  	user string,
    86  	pass string,
    87  	tenantID string,
    88  	queryTimeout time.Duration,
    89  	labelName string,
    90  	labelVal string,
    91  	streamName string,
    92  	streamValue string,
    93  	interval time.Duration,
    94  ) (*Reader, error) {
    95  	h := http.Header{}
    96  
    97  	// http.DefaultClient will be used in the case that the connection to Loki is http or TLS without client certs.
    98  	httpClient := http.DefaultClient
    99  	if tlsConfig != nil {
   100  		// For the mTLS case, use a http.Client configured with the client side certificates.
   101  		rt, err := config.NewTLSRoundTripper(tlsConfig, caFile, func(tls *tls.Config) (http.RoundTripper, error) {
   102  			return &http.Transport{TLSClientConfig: tls}, nil
   103  		})
   104  		if err != nil {
   105  			return nil, errors.Wrapf(err, "Failed to create HTTPS transport with TLS config")
   106  		}
   107  		httpClient = &http.Client{Transport: rt}
   108  	}
   109  	if user != "" {
   110  		h.Set("Authorization", "Basic "+base64.StdEncoding.EncodeToString([]byte(user+":"+pass)))
   111  	}
   112  	if tenantID != "" {
   113  		h.Set("X-Scope-OrgID", tenantID)
   114  	}
   115  
   116  	next := time.Now()
   117  	bkcfg := backoff.Config{
   118  		MinBackoff: 1 * time.Second,
   119  		MaxBackoff: 10 * time.Minute,
   120  		MaxRetries: 0,
   121  	}
   122  	bkoff := backoff.New(context.Background(), bkcfg)
   123  
   124  	rd := Reader{
   125  		header:          h,
   126  		useTLS:          useTLS,
   127  		clientTLSConfig: tlsConfig,
   128  		caFile:          caFile,
   129  		addr:            address,
   130  		user:            user,
   131  		pass:            pass,
   132  		tenantID:        tenantID,
   133  		queryTimeout:    queryTimeout,
   134  		httpClient:      httpClient,
   135  		sName:           streamName,
   136  		sValue:          streamValue,
   137  		lName:           labelName,
   138  		lVal:            labelVal,
   139  		nextQuery:       next,
   140  		backoff:         bkoff,
   141  		interval:        interval,
   142  		w:               writer,
   143  		recv:            receivedChan,
   144  		quit:            make(chan struct{}),
   145  		done:            make(chan struct{}),
   146  		shuttingDown:    false,
   147  	}
   148  
   149  	go rd.run()
   150  
   151  	go func() {
   152  		<-rd.quit
   153  		if rd.conn != nil {
   154  			fmt.Fprintf(rd.w, "shutting down reader\n")
   155  			rd.shuttingDown = true
   156  			_ = rd.conn.Close()
   157  		}
   158  	}()
   159  
   160  	return &rd, nil
   161  }
   162  
   163  func (r *Reader) Stop() {
   164  	if r.quit != nil {
   165  		close(r.quit)
   166  		<-r.done
   167  		r.quit = nil
   168  	}
   169  }
   170  
   171  // QueryCountOverTime will ask Loki for a count of logs over the provided range e.g. 5m
   172  // QueryCountOverTime blocks if a previous query has failed until the appropriate backoff time has been reached.
   173  func (r *Reader) QueryCountOverTime(queryRange string) (float64, error) {
   174  	r.backoffMtx.RLock()
   175  	next := r.nextQuery
   176  	r.backoffMtx.RUnlock()
   177  	for time.Now().Before(next) {
   178  		time.Sleep(50 * time.Millisecond)
   179  		// Update next in case other queries have tried and failed
   180  		r.backoffMtx.RLock()
   181  		next = r.nextQuery
   182  		r.backoffMtx.RUnlock()
   183  	}
   184  
   185  	scheme := "http"
   186  	if r.useTLS {
   187  		scheme = "https"
   188  	}
   189  	u := url.URL{
   190  		Scheme: scheme,
   191  		Host:   r.addr,
   192  		Path:   "/loki/api/v1/query",
   193  		RawQuery: "query=" + url.QueryEscape(fmt.Sprintf("count_over_time({%v=\"%v\",%v=\"%v\"}[%s])", r.sName, r.sValue, r.lName, r.lVal, queryRange)) +
   194  			fmt.Sprintf("&time=%d", time.Now().UnixNano()) +
   195  			"&limit=1000",
   196  	}
   197  	fmt.Fprintf(r.w, "Querying loki for metric count with query: %v\n", u.String())
   198  
   199  	ctx, cancel := context.WithTimeout(context.Background(), r.queryTimeout)
   200  	defer cancel()
   201  
   202  	req, err := http.NewRequestWithContext(ctx, "GET", u.String(), nil)
   203  	if err != nil {
   204  		return 0, err
   205  	}
   206  
   207  	if r.user != "" {
   208  		req.SetBasicAuth(r.user, r.pass)
   209  	}
   210  	if r.tenantID != "" {
   211  		req.Header.Set("X-Scope-OrgID", r.tenantID)
   212  	}
   213  	req.Header.Set("User-Agent", userAgent)
   214  
   215  	resp, err := r.httpClient.Do(req)
   216  	if err != nil {
   217  		return 0, errors.Wrap(err, "query request failed")
   218  	}
   219  	defer func() {
   220  		if err := resp.Body.Close(); err != nil {
   221  			log.Println("error closing body", err)
   222  		}
   223  	}()
   224  
   225  	if resp.StatusCode/100 != 2 {
   226  		r.backoffMtx.Lock()
   227  		r.nextQuery = nextBackoff(r.w, resp.StatusCode, r.backoff)
   228  		r.backoffMtx.Unlock()
   229  		buf, _ := ioutil.ReadAll(resp.Body)
   230  		return 0, fmt.Errorf("error response from server: %s (%v)", string(buf), err)
   231  	}
   232  	// No Errors, reset backoff
   233  	r.backoffMtx.Lock()
   234  	r.backoff.Reset()
   235  	r.backoffMtx.Unlock()
   236  
   237  	var decoded loghttp.QueryResponse
   238  	err = json.NewDecoder(resp.Body).Decode(&decoded)
   239  	if err != nil {
   240  		return 0, err
   241  	}
   242  
   243  	value := decoded.Data.Result
   244  	ret := 0.0
   245  	switch value.Type() {
   246  	case loghttp.ResultTypeVector:
   247  		series := value.(loghttp.Vector)
   248  		if len(series) > 1 {
   249  			return 0, fmt.Errorf("expected only a single series result in the metric test query vector, instead received %v", len(series))
   250  		}
   251  		if len(series) == 0 {
   252  			return 0, fmt.Errorf("expected to receive one sample in the result vector, received 0")
   253  		}
   254  		ret = float64(series[0].Value)
   255  	default:
   256  		return 0, fmt.Errorf("unexpected result type, expected a Vector result instead received %v", value.Type())
   257  	}
   258  
   259  	return ret, nil
   260  }
   261  
   262  // Query will ask Loki for all canary timestamps in the requested timerange.
   263  // Query blocks if a previous query has failed until the appropriate backoff time has been reached.
   264  func (r *Reader) Query(start time.Time, end time.Time) ([]time.Time, error) {
   265  	r.backoffMtx.RLock()
   266  	next := r.nextQuery
   267  	r.backoffMtx.RUnlock()
   268  	for time.Now().Before(next) {
   269  		time.Sleep(50 * time.Millisecond)
   270  		// Update next in case other queries have tried and failed moving it even farther in the future
   271  		r.backoffMtx.RLock()
   272  		next = r.nextQuery
   273  		r.backoffMtx.RUnlock()
   274  	}
   275  
   276  	scheme := "http"
   277  	if r.useTLS {
   278  		scheme = "https"
   279  	}
   280  	u := url.URL{
   281  		Scheme: scheme,
   282  		Host:   r.addr,
   283  		Path:   "/loki/api/v1/query_range",
   284  		RawQuery: fmt.Sprintf("start=%d&end=%d", start.UnixNano(), end.UnixNano()) +
   285  			"&query=" + url.QueryEscape(fmt.Sprintf("{%v=\"%v\",%v=\"%v\"}", r.sName, r.sValue, r.lName, r.lVal)) +
   286  			"&limit=1000",
   287  	}
   288  	fmt.Fprintf(r.w, "Querying loki for logs with query: %v\n", u.String())
   289  
   290  	ctx, cancel := context.WithTimeout(context.Background(), r.queryTimeout)
   291  	defer cancel()
   292  
   293  	req, err := http.NewRequestWithContext(ctx, "GET", u.String(), nil)
   294  	if err != nil {
   295  		return nil, err
   296  	}
   297  
   298  	if r.user != "" {
   299  		req.SetBasicAuth(r.user, r.pass)
   300  	}
   301  	if r.tenantID != "" {
   302  		req.Header.Set("X-Scope-OrgID", r.tenantID)
   303  	}
   304  	req.Header.Set("User-Agent", userAgent)
   305  
   306  	resp, err := r.httpClient.Do(req)
   307  	if err != nil {
   308  		return nil, errors.Wrap(err, "query_range request failed")
   309  	}
   310  	defer func() {
   311  		if err := resp.Body.Close(); err != nil {
   312  			log.Println("error closing body", err)
   313  		}
   314  	}()
   315  
   316  	if resp.StatusCode/100 != 2 {
   317  		r.backoffMtx.Lock()
   318  		r.nextQuery = nextBackoff(r.w, resp.StatusCode, r.backoff)
   319  		r.backoffMtx.Unlock()
   320  		buf, _ := ioutil.ReadAll(resp.Body)
   321  		return nil, fmt.Errorf("error response from server: %s (%v)", string(buf), err)
   322  	}
   323  	// No Errors, reset backoff
   324  	r.backoffMtx.Lock()
   325  	r.backoff.Reset()
   326  	r.backoffMtx.Unlock()
   327  
   328  	var decoded loghttp.QueryResponse
   329  	err = json.NewDecoder(resp.Body).Decode(&decoded)
   330  	if err != nil {
   331  		return nil, err
   332  	}
   333  
   334  	tss := []time.Time{}
   335  	value := decoded.Data.Result
   336  	switch value.Type() {
   337  	case logqlmodel.ValueTypeStreams:
   338  		for _, stream := range value.(loghttp.Streams) {
   339  			for _, entry := range stream.Entries {
   340  				ts, err := parseResponse(&entry)
   341  				if err != nil {
   342  					fmt.Fprint(r.w, err)
   343  					continue
   344  				}
   345  				tss = append(tss, *ts)
   346  			}
   347  		}
   348  	default:
   349  		return nil, fmt.Errorf("unexpected result type, expected a log stream result instead received %v", value.Type())
   350  	}
   351  
   352  	return tss, nil
   353  }
   354  
   355  // run uses the established websocket connection to tail logs from Loki
   356  func (r *Reader) run() {
   357  	r.closeAndReconnect()
   358  
   359  	tailResponse := &loghttp.TailResponse{}
   360  	lastMessageTs := time.Now()
   361  
   362  	for {
   363  		if r.shuttingDown {
   364  			if r.conn != nil {
   365  				_ = r.conn.Close()
   366  				r.conn = nil
   367  			}
   368  			close(r.done)
   369  			return
   370  		}
   371  
   372  		// Set a read timeout of 10x the interval we expect to see messages
   373  		// Ignore the error as it will get caught when we call ReadJSON
   374  		timeoutInterval := 10 * r.interval
   375  		_ = r.conn.SetReadDeadline(time.Now().Add(timeoutInterval))
   376  		// I assume this is a blocking call that either reads from the websocket connection
   377  		// or times out based on the above SetReadDeadline call.
   378  		err := unmarshal.ReadTailResponseJSON(tailResponse, r.conn)
   379  		if err != nil {
   380  			reason := "error reading websocket"
   381  			if e, ok := err.(net.Error); ok && e.Timeout() {
   382  				reason = fmt.Sprintf("timeout tailing new logs (timeout period: %.2fs)", timeoutInterval.Seconds())
   383  			}
   384  
   385  			fmt.Fprintf(r.w, "%s, will retry in 10 seconds: %s\n", reason, err)
   386  
   387  			// Even though we sleep between connection retries, we found it's possible to DOS Loki if the connection
   388  			// succeeds but some other error is returned, so also sleep here before retrying.
   389  			<-time.After(10 * time.Second)
   390  			r.closeAndReconnect()
   391  			continue
   392  		}
   393  		// If there were streams update last message timestamp
   394  		if len(tailResponse.Streams) > 0 {
   395  			lastMessageTs = time.Now()
   396  		}
   397  		for _, stream := range tailResponse.Streams {
   398  			for _, entry := range stream.Entries {
   399  				ts, err := parseResponse(&entry)
   400  				if err != nil {
   401  					fmt.Fprint(r.w, err)
   402  					continue
   403  				}
   404  				r.recv <- *ts
   405  			}
   406  		}
   407  		// Ping messages can reset the read deadline so also make sure we are receiving regular messages.
   408  		// We use the same 10x interval to make sure we are getting recent messages
   409  		if time.Since(lastMessageTs).Milliseconds() > 10*r.interval.Milliseconds() {
   410  			fmt.Fprintf(r.w, "Have not received a canary message from loki on the websocket in %vms, "+
   411  				"sleeping 10s and reconnecting\n", 10*r.interval.Milliseconds())
   412  			<-time.After(10 * time.Second)
   413  			r.closeAndReconnect()
   414  			continue
   415  		}
   416  	}
   417  }
   418  
   419  // closeAndReconnect establishes a web socket connection to Loki and sets up a tail query
   420  // with the stream and label selectors which can be used check if all logs generated by the
   421  // canary have been received by Loki.
   422  func (r *Reader) closeAndReconnect() {
   423  	if r.shuttingDown {
   424  		return
   425  	}
   426  	if r.conn != nil {
   427  		_ = r.conn.Close()
   428  		r.conn = nil
   429  		// By incrementing reconnects here we should only count a failure followed by a successful reconnect.
   430  		// Initial connections and reconnections from failed tries will not be counted.
   431  		reconnects.Inc()
   432  	}
   433  	for r.conn == nil {
   434  		scheme := "ws"
   435  		if r.useTLS {
   436  			scheme = "wss"
   437  		}
   438  		u := url.URL{
   439  			Scheme:   scheme,
   440  			Host:     r.addr,
   441  			Path:     "/loki/api/v1/tail",
   442  			RawQuery: "query=" + url.QueryEscape(fmt.Sprintf("{%v=\"%v\",%v=\"%v\"}", r.sName, r.sValue, r.lName, r.lVal)),
   443  		}
   444  
   445  		fmt.Fprintf(r.w, "Connecting to loki at %v, querying for label '%v' with value '%v'\n", u.String(), r.lName, r.lVal)
   446  
   447  		dialer := r.webSocketDialer()
   448  		c, _, err := dialer.Dial(u.String(), r.header)
   449  		if err != nil {
   450  			fmt.Fprintf(r.w, "failed to connect to %s with err %s\n", u.String(), err)
   451  			<-time.After(10 * time.Second)
   452  			continue
   453  		}
   454  		// Use a custom ping handler so we can increment a metric, this is copied from the default ping handler
   455  		c.SetPingHandler(func(message string) error {
   456  			websocketPings.Inc()
   457  			err := c.WriteControl(websocket.PongMessage, []byte(message), time.Now().Add(time.Second))
   458  			if err == websocket.ErrCloseSent {
   459  				return nil
   460  			} else if e, ok := err.(net.Error); ok && e.Temporary() {
   461  				return nil
   462  			}
   463  			return err
   464  		})
   465  		r.conn = c
   466  	}
   467  }
   468  
   469  // webSocketDialer creates a dialer for the web socket connection to Loki
   470  // websocket.DefaultDialer will be returned in the case that the connection to Loki is http or TLS without client certs.
   471  // For the mTLS case, return a websocket.Dialer configured to use client side certificates.
   472  func (r *Reader) webSocketDialer() *websocket.Dialer {
   473  	return &websocket.Dialer{
   474  		Proxy:            http.ProxyFromEnvironment,
   475  		TLSClientConfig:  r.clientTLSConfig,
   476  		HandshakeTimeout: 45 * time.Second,
   477  	}
   478  }
   479  
   480  func parseResponse(entry *loghttp.Entry) (*time.Time, error) {
   481  	sp := strings.Split(entry.Line, " ")
   482  	if len(sp) != 2 {
   483  		return nil, errors.Errorf("received invalid entry: %s", entry.Line)
   484  	}
   485  	ts, err := strconv.ParseInt(sp[0], 10, 64)
   486  	if err != nil {
   487  		return nil, errors.Errorf("failed to parse timestamp: %s", sp[0])
   488  	}
   489  	t := time.Unix(0, ts)
   490  	return &t, nil
   491  }
   492  
   493  func nextBackoff(w io.Writer, statusCode int, backoff *backoff.Backoff) time.Time {
   494  	// Be way more conservative with an http 429 and wait 5 minutes before trying again.
   495  	var next time.Time
   496  	if statusCode == http.StatusTooManyRequests {
   497  		next = time.Now().Add(5 * time.Minute)
   498  	} else {
   499  		next = time.Now().Add(backoff.NextDelay())
   500  	}
   501  	fmt.Fprintf(w, "Loki returned an error code: %v, waiting %v before next query.", statusCode, time.Until(next))
   502  	return next
   503  }