github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/pkg/canary/reader/reader.go (about) 1 package reader 2 3 import ( 4 "context" 5 "crypto/tls" 6 "encoding/base64" 7 "fmt" 8 "io" 9 "io/ioutil" 10 "log" 11 "net" 12 "net/http" 13 "net/url" 14 "strconv" 15 "strings" 16 "sync" 17 "time" 18 19 "github.com/gorilla/websocket" 20 "github.com/grafana/dskit/backoff" 21 json "github.com/json-iterator/go" 22 "github.com/pkg/errors" 23 "github.com/prometheus/client_golang/prometheus" 24 "github.com/prometheus/client_golang/prometheus/promauto" 25 "github.com/prometheus/common/config" 26 27 "github.com/grafana/loki/pkg/loghttp" 28 "github.com/grafana/loki/pkg/logqlmodel" 29 "github.com/grafana/loki/pkg/util/build" 30 "github.com/grafana/loki/pkg/util/unmarshal" 31 ) 32 33 var ( 34 reconnects = promauto.NewCounter(prometheus.CounterOpts{ 35 Namespace: "loki_canary", 36 Name: "ws_reconnects_total", 37 Help: "counts every time the websocket connection has to reconnect", 38 }) 39 websocketPings = promauto.NewCounter(prometheus.CounterOpts{ 40 Namespace: "loki_canary", 41 Name: "ws_pings_total", 42 Help: "counts every time the websocket receives a ping message", 43 }) 44 userAgent = fmt.Sprintf("loki-canary/%s", build.Version) 45 ) 46 47 type LokiReader interface { 48 Query(start time.Time, end time.Time) ([]time.Time, error) 49 QueryCountOverTime(queryRange string) (float64, error) 50 } 51 52 type Reader struct { 53 header http.Header 54 useTLS bool 55 clientTLSConfig *tls.Config 56 caFile string 57 addr string 58 user string 59 pass string 60 tenantID string 61 httpClient *http.Client 62 queryTimeout time.Duration 63 sName string 64 sValue string 65 lName string 66 lVal string 67 backoff *backoff.Backoff 68 nextQuery time.Time 69 backoffMtx sync.RWMutex 70 interval time.Duration 71 conn *websocket.Conn 72 w io.Writer 73 recv chan time.Time 74 quit chan struct{} 75 shuttingDown bool 76 done chan struct{} 77 } 78 79 func NewReader(writer io.Writer, 80 receivedChan chan time.Time, 81 useTLS bool, 82 tlsConfig *tls.Config, 83 caFile string, 84 address string, 85 user string, 86 pass string, 87 tenantID string, 88 queryTimeout time.Duration, 89 labelName string, 90 labelVal string, 91 streamName string, 92 streamValue string, 93 interval time.Duration, 94 ) (*Reader, error) { 95 h := http.Header{} 96 97 // http.DefaultClient will be used in the case that the connection to Loki is http or TLS without client certs. 98 httpClient := http.DefaultClient 99 if tlsConfig != nil { 100 // For the mTLS case, use a http.Client configured with the client side certificates. 101 rt, err := config.NewTLSRoundTripper(tlsConfig, caFile, func(tls *tls.Config) (http.RoundTripper, error) { 102 return &http.Transport{TLSClientConfig: tls}, nil 103 }) 104 if err != nil { 105 return nil, errors.Wrapf(err, "Failed to create HTTPS transport with TLS config") 106 } 107 httpClient = &http.Client{Transport: rt} 108 } 109 if user != "" { 110 h.Set("Authorization", "Basic "+base64.StdEncoding.EncodeToString([]byte(user+":"+pass))) 111 } 112 if tenantID != "" { 113 h.Set("X-Scope-OrgID", tenantID) 114 } 115 116 next := time.Now() 117 bkcfg := backoff.Config{ 118 MinBackoff: 1 * time.Second, 119 MaxBackoff: 10 * time.Minute, 120 MaxRetries: 0, 121 } 122 bkoff := backoff.New(context.Background(), bkcfg) 123 124 rd := Reader{ 125 header: h, 126 useTLS: useTLS, 127 clientTLSConfig: tlsConfig, 128 caFile: caFile, 129 addr: address, 130 user: user, 131 pass: pass, 132 tenantID: tenantID, 133 queryTimeout: queryTimeout, 134 httpClient: httpClient, 135 sName: streamName, 136 sValue: streamValue, 137 lName: labelName, 138 lVal: labelVal, 139 nextQuery: next, 140 backoff: bkoff, 141 interval: interval, 142 w: writer, 143 recv: receivedChan, 144 quit: make(chan struct{}), 145 done: make(chan struct{}), 146 shuttingDown: false, 147 } 148 149 go rd.run() 150 151 go func() { 152 <-rd.quit 153 if rd.conn != nil { 154 fmt.Fprintf(rd.w, "shutting down reader\n") 155 rd.shuttingDown = true 156 _ = rd.conn.Close() 157 } 158 }() 159 160 return &rd, nil 161 } 162 163 func (r *Reader) Stop() { 164 if r.quit != nil { 165 close(r.quit) 166 <-r.done 167 r.quit = nil 168 } 169 } 170 171 // QueryCountOverTime will ask Loki for a count of logs over the provided range e.g. 5m 172 // QueryCountOverTime blocks if a previous query has failed until the appropriate backoff time has been reached. 173 func (r *Reader) QueryCountOverTime(queryRange string) (float64, error) { 174 r.backoffMtx.RLock() 175 next := r.nextQuery 176 r.backoffMtx.RUnlock() 177 for time.Now().Before(next) { 178 time.Sleep(50 * time.Millisecond) 179 // Update next in case other queries have tried and failed 180 r.backoffMtx.RLock() 181 next = r.nextQuery 182 r.backoffMtx.RUnlock() 183 } 184 185 scheme := "http" 186 if r.useTLS { 187 scheme = "https" 188 } 189 u := url.URL{ 190 Scheme: scheme, 191 Host: r.addr, 192 Path: "/loki/api/v1/query", 193 RawQuery: "query=" + url.QueryEscape(fmt.Sprintf("count_over_time({%v=\"%v\",%v=\"%v\"}[%s])", r.sName, r.sValue, r.lName, r.lVal, queryRange)) + 194 fmt.Sprintf("&time=%d", time.Now().UnixNano()) + 195 "&limit=1000", 196 } 197 fmt.Fprintf(r.w, "Querying loki for metric count with query: %v\n", u.String()) 198 199 ctx, cancel := context.WithTimeout(context.Background(), r.queryTimeout) 200 defer cancel() 201 202 req, err := http.NewRequestWithContext(ctx, "GET", u.String(), nil) 203 if err != nil { 204 return 0, err 205 } 206 207 if r.user != "" { 208 req.SetBasicAuth(r.user, r.pass) 209 } 210 if r.tenantID != "" { 211 req.Header.Set("X-Scope-OrgID", r.tenantID) 212 } 213 req.Header.Set("User-Agent", userAgent) 214 215 resp, err := r.httpClient.Do(req) 216 if err != nil { 217 return 0, errors.Wrap(err, "query request failed") 218 } 219 defer func() { 220 if err := resp.Body.Close(); err != nil { 221 log.Println("error closing body", err) 222 } 223 }() 224 225 if resp.StatusCode/100 != 2 { 226 r.backoffMtx.Lock() 227 r.nextQuery = nextBackoff(r.w, resp.StatusCode, r.backoff) 228 r.backoffMtx.Unlock() 229 buf, _ := ioutil.ReadAll(resp.Body) 230 return 0, fmt.Errorf("error response from server: %s (%v)", string(buf), err) 231 } 232 // No Errors, reset backoff 233 r.backoffMtx.Lock() 234 r.backoff.Reset() 235 r.backoffMtx.Unlock() 236 237 var decoded loghttp.QueryResponse 238 err = json.NewDecoder(resp.Body).Decode(&decoded) 239 if err != nil { 240 return 0, err 241 } 242 243 value := decoded.Data.Result 244 ret := 0.0 245 switch value.Type() { 246 case loghttp.ResultTypeVector: 247 series := value.(loghttp.Vector) 248 if len(series) > 1 { 249 return 0, fmt.Errorf("expected only a single series result in the metric test query vector, instead received %v", len(series)) 250 } 251 if len(series) == 0 { 252 return 0, fmt.Errorf("expected to receive one sample in the result vector, received 0") 253 } 254 ret = float64(series[0].Value) 255 default: 256 return 0, fmt.Errorf("unexpected result type, expected a Vector result instead received %v", value.Type()) 257 } 258 259 return ret, nil 260 } 261 262 // Query will ask Loki for all canary timestamps in the requested timerange. 263 // Query blocks if a previous query has failed until the appropriate backoff time has been reached. 264 func (r *Reader) Query(start time.Time, end time.Time) ([]time.Time, error) { 265 r.backoffMtx.RLock() 266 next := r.nextQuery 267 r.backoffMtx.RUnlock() 268 for time.Now().Before(next) { 269 time.Sleep(50 * time.Millisecond) 270 // Update next in case other queries have tried and failed moving it even farther in the future 271 r.backoffMtx.RLock() 272 next = r.nextQuery 273 r.backoffMtx.RUnlock() 274 } 275 276 scheme := "http" 277 if r.useTLS { 278 scheme = "https" 279 } 280 u := url.URL{ 281 Scheme: scheme, 282 Host: r.addr, 283 Path: "/loki/api/v1/query_range", 284 RawQuery: fmt.Sprintf("start=%d&end=%d", start.UnixNano(), end.UnixNano()) + 285 "&query=" + url.QueryEscape(fmt.Sprintf("{%v=\"%v\",%v=\"%v\"}", r.sName, r.sValue, r.lName, r.lVal)) + 286 "&limit=1000", 287 } 288 fmt.Fprintf(r.w, "Querying loki for logs with query: %v\n", u.String()) 289 290 ctx, cancel := context.WithTimeout(context.Background(), r.queryTimeout) 291 defer cancel() 292 293 req, err := http.NewRequestWithContext(ctx, "GET", u.String(), nil) 294 if err != nil { 295 return nil, err 296 } 297 298 if r.user != "" { 299 req.SetBasicAuth(r.user, r.pass) 300 } 301 if r.tenantID != "" { 302 req.Header.Set("X-Scope-OrgID", r.tenantID) 303 } 304 req.Header.Set("User-Agent", userAgent) 305 306 resp, err := r.httpClient.Do(req) 307 if err != nil { 308 return nil, errors.Wrap(err, "query_range request failed") 309 } 310 defer func() { 311 if err := resp.Body.Close(); err != nil { 312 log.Println("error closing body", err) 313 } 314 }() 315 316 if resp.StatusCode/100 != 2 { 317 r.backoffMtx.Lock() 318 r.nextQuery = nextBackoff(r.w, resp.StatusCode, r.backoff) 319 r.backoffMtx.Unlock() 320 buf, _ := ioutil.ReadAll(resp.Body) 321 return nil, fmt.Errorf("error response from server: %s (%v)", string(buf), err) 322 } 323 // No Errors, reset backoff 324 r.backoffMtx.Lock() 325 r.backoff.Reset() 326 r.backoffMtx.Unlock() 327 328 var decoded loghttp.QueryResponse 329 err = json.NewDecoder(resp.Body).Decode(&decoded) 330 if err != nil { 331 return nil, err 332 } 333 334 tss := []time.Time{} 335 value := decoded.Data.Result 336 switch value.Type() { 337 case logqlmodel.ValueTypeStreams: 338 for _, stream := range value.(loghttp.Streams) { 339 for _, entry := range stream.Entries { 340 ts, err := parseResponse(&entry) 341 if err != nil { 342 fmt.Fprint(r.w, err) 343 continue 344 } 345 tss = append(tss, *ts) 346 } 347 } 348 default: 349 return nil, fmt.Errorf("unexpected result type, expected a log stream result instead received %v", value.Type()) 350 } 351 352 return tss, nil 353 } 354 355 // run uses the established websocket connection to tail logs from Loki 356 func (r *Reader) run() { 357 r.closeAndReconnect() 358 359 tailResponse := &loghttp.TailResponse{} 360 lastMessageTs := time.Now() 361 362 for { 363 if r.shuttingDown { 364 if r.conn != nil { 365 _ = r.conn.Close() 366 r.conn = nil 367 } 368 close(r.done) 369 return 370 } 371 372 // Set a read timeout of 10x the interval we expect to see messages 373 // Ignore the error as it will get caught when we call ReadJSON 374 timeoutInterval := 10 * r.interval 375 _ = r.conn.SetReadDeadline(time.Now().Add(timeoutInterval)) 376 // I assume this is a blocking call that either reads from the websocket connection 377 // or times out based on the above SetReadDeadline call. 378 err := unmarshal.ReadTailResponseJSON(tailResponse, r.conn) 379 if err != nil { 380 reason := "error reading websocket" 381 if e, ok := err.(net.Error); ok && e.Timeout() { 382 reason = fmt.Sprintf("timeout tailing new logs (timeout period: %.2fs)", timeoutInterval.Seconds()) 383 } 384 385 fmt.Fprintf(r.w, "%s, will retry in 10 seconds: %s\n", reason, err) 386 387 // Even though we sleep between connection retries, we found it's possible to DOS Loki if the connection 388 // succeeds but some other error is returned, so also sleep here before retrying. 389 <-time.After(10 * time.Second) 390 r.closeAndReconnect() 391 continue 392 } 393 // If there were streams update last message timestamp 394 if len(tailResponse.Streams) > 0 { 395 lastMessageTs = time.Now() 396 } 397 for _, stream := range tailResponse.Streams { 398 for _, entry := range stream.Entries { 399 ts, err := parseResponse(&entry) 400 if err != nil { 401 fmt.Fprint(r.w, err) 402 continue 403 } 404 r.recv <- *ts 405 } 406 } 407 // Ping messages can reset the read deadline so also make sure we are receiving regular messages. 408 // We use the same 10x interval to make sure we are getting recent messages 409 if time.Since(lastMessageTs).Milliseconds() > 10*r.interval.Milliseconds() { 410 fmt.Fprintf(r.w, "Have not received a canary message from loki on the websocket in %vms, "+ 411 "sleeping 10s and reconnecting\n", 10*r.interval.Milliseconds()) 412 <-time.After(10 * time.Second) 413 r.closeAndReconnect() 414 continue 415 } 416 } 417 } 418 419 // closeAndReconnect establishes a web socket connection to Loki and sets up a tail query 420 // with the stream and label selectors which can be used check if all logs generated by the 421 // canary have been received by Loki. 422 func (r *Reader) closeAndReconnect() { 423 if r.shuttingDown { 424 return 425 } 426 if r.conn != nil { 427 _ = r.conn.Close() 428 r.conn = nil 429 // By incrementing reconnects here we should only count a failure followed by a successful reconnect. 430 // Initial connections and reconnections from failed tries will not be counted. 431 reconnects.Inc() 432 } 433 for r.conn == nil { 434 scheme := "ws" 435 if r.useTLS { 436 scheme = "wss" 437 } 438 u := url.URL{ 439 Scheme: scheme, 440 Host: r.addr, 441 Path: "/loki/api/v1/tail", 442 RawQuery: "query=" + url.QueryEscape(fmt.Sprintf("{%v=\"%v\",%v=\"%v\"}", r.sName, r.sValue, r.lName, r.lVal)), 443 } 444 445 fmt.Fprintf(r.w, "Connecting to loki at %v, querying for label '%v' with value '%v'\n", u.String(), r.lName, r.lVal) 446 447 dialer := r.webSocketDialer() 448 c, _, err := dialer.Dial(u.String(), r.header) 449 if err != nil { 450 fmt.Fprintf(r.w, "failed to connect to %s with err %s\n", u.String(), err) 451 <-time.After(10 * time.Second) 452 continue 453 } 454 // Use a custom ping handler so we can increment a metric, this is copied from the default ping handler 455 c.SetPingHandler(func(message string) error { 456 websocketPings.Inc() 457 err := c.WriteControl(websocket.PongMessage, []byte(message), time.Now().Add(time.Second)) 458 if err == websocket.ErrCloseSent { 459 return nil 460 } else if e, ok := err.(net.Error); ok && e.Temporary() { 461 return nil 462 } 463 return err 464 }) 465 r.conn = c 466 } 467 } 468 469 // webSocketDialer creates a dialer for the web socket connection to Loki 470 // websocket.DefaultDialer will be returned in the case that the connection to Loki is http or TLS without client certs. 471 // For the mTLS case, return a websocket.Dialer configured to use client side certificates. 472 func (r *Reader) webSocketDialer() *websocket.Dialer { 473 return &websocket.Dialer{ 474 Proxy: http.ProxyFromEnvironment, 475 TLSClientConfig: r.clientTLSConfig, 476 HandshakeTimeout: 45 * time.Second, 477 } 478 } 479 480 func parseResponse(entry *loghttp.Entry) (*time.Time, error) { 481 sp := strings.Split(entry.Line, " ") 482 if len(sp) != 2 { 483 return nil, errors.Errorf("received invalid entry: %s", entry.Line) 484 } 485 ts, err := strconv.ParseInt(sp[0], 10, 64) 486 if err != nil { 487 return nil, errors.Errorf("failed to parse timestamp: %s", sp[0]) 488 } 489 t := time.Unix(0, ts) 490 return &t, nil 491 } 492 493 func nextBackoff(w io.Writer, statusCode int, backoff *backoff.Backoff) time.Time { 494 // Be way more conservative with an http 429 and wait 5 minutes before trying again. 495 var next time.Time 496 if statusCode == http.StatusTooManyRequests { 497 next = time.Now().Add(5 * time.Minute) 498 } else { 499 next = time.Now().Add(backoff.NextDelay()) 500 } 501 fmt.Fprintf(w, "Loki returned an error code: %v, waiting %v before next query.", statusCode, time.Until(next)) 502 return next 503 }