github.com/juju/juju@v0.0.0-20240430160146-1752b71fcf00/apiserver/logsink/logsink.go (about)

     1  // Copyright 2015-2017 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package logsink
     5  
     6  import (
     7  	"io"
     8  	"net/http"
     9  	"sync"
    10  	"time"
    11  
    12  	gorillaws "github.com/gorilla/websocket"
    13  	"github.com/juju/clock"
    14  	"github.com/juju/errors"
    15  	"github.com/juju/featureflag"
    16  	"github.com/juju/loggo"
    17  	"github.com/juju/ratelimit"
    18  	"github.com/prometheus/client_golang/prometheus"
    19  
    20  	"github.com/juju/juju/apiserver/httpcontext"
    21  	"github.com/juju/juju/apiserver/websocket"
    22  	"github.com/juju/juju/feature"
    23  	"github.com/juju/juju/rpc/params"
    24  )
    25  
const (
	// Labels applied to the LogWriteCount metric, recording whether a
	// log record was persisted successfully.
	metricLogWriteLabelSuccess = "success"
	metricLogWriteLabelFailure = "failure"
)

const (
	// Labels applied to the LogReadCount metric, distinguishing a clean
	// client disconnect from a read/decode error.
	metricLogReadLabelSuccess    = "success"
	metricLogReadLabelError      = "error"
	metricLogReadLabelDisconnect = "disconnect"
)
    36  
    37  var logger = loggo.GetLogger("juju.apiserver.logsink")
    38  
// LogWriteCloser provides an interface for persisting log records.
// The Close method should be called to release any resources once
// the caller is finished with the writer.
type LogWriteCloser interface {
	io.Closer

	// WriteLog writes out the given log record.
	WriteLog(params.LogRecord) error
}
    48  
// NewLogWriteCloserFunc returns a new LogWriteCloser for the given
// http.Request. It is invoked once per accepted logsink websocket
// connection, and the returned writer is closed when that connection
// is torn down.
type NewLogWriteCloserFunc func(*http.Request) (LogWriteCloser, error)
    51  
// RateLimitConfig contains the rate-limit configuration for the logsink
// handler. Rate limiting is applied per connection, not globally.
type RateLimitConfig struct {
	// Burst is the number of log messages that will be let through before
	// we start rate limiting.
	Burst int64

	// Refill is the rate at which log messages will be let through once
	// the initial burst amount has been depleted (one token per Refill
	// interval).
	Refill time.Duration

	// Clock is the clock used to wait when rate-limiting log receives.
	Clock clock.Clock
}
    66  
// CounterVec is a Collector that bundles a set of Counters that all share the
// same description.
type CounterVec interface {
	// With returns a Counter for the given label set.
	With(prometheus.Labels) prometheus.Counter
}

// GaugeVec is a Collector that bundles a set of Gauges that all share the
// same description.
type GaugeVec interface {
	// With returns a Gauge for the given label set.
	With(prometheus.Labels) prometheus.Gauge
}
    80  
// MetricsCollector represents a way to change the metrics for the logsink
// api handler.
//
//go:generate go run go.uber.org/mock/mockgen -package mocks -destination mocks/metrics_collector_mock.go github.com/juju/juju/apiserver/logsink MetricsCollector
//go:generate go run go.uber.org/mock/mockgen -package mocks -destination mocks/metrics_mock.go github.com/prometheus/client_golang/prometheus Counter,Gauge
type MetricsCollector interface {

	// TotalConnections returns a prometheus metric that can be incremented
	// as a counter for the total number of connections being served from
	// the api handler.
	TotalConnections() prometheus.Counter

	// Connections returns a prometheus metric that can be incremented and
	// decremented as a gauge, for the number of connections currently being
	// served from the api handler.
	Connections() prometheus.Gauge

	// PingFailureCount returns a prometheus metric for the number of
	// ping failures per model uuid, that can be incremented as
	// a counter.
	PingFailureCount(modelUUID string) prometheus.Counter

	// LogWriteCount returns a prometheus metric for the number of writes to
	// the log that happened. It's split on the success/failure, so the charts
	// will have to take that into account.
	LogWriteCount(modelUUID, state string) prometheus.Counter

	// LogReadCount returns a prometheus metric for the number of reads to
	// the log that happened. It's split on the success/error/disconnect, so
	// the charts will have to take that into account.
	LogReadCount(modelUUID, state string) prometheus.Counter
}
   113  
   114  // NewHTTPHandler returns a new http.Handler for receiving log messages over a
   115  // websocket, using the given NewLogWriteCloserFunc to obtain a writer to which
   116  // the log messages will be written.
   117  //
   118  // ratelimit defines an optional rate-limit configuration. If nil, no rate-
   119  // limiting will be applied.
   120  func NewHTTPHandler(
   121  	newLogWriteCloser NewLogWriteCloserFunc,
   122  	abort <-chan struct{},
   123  	ratelimit *RateLimitConfig,
   124  	metrics MetricsCollector,
   125  	modelUUID string,
   126  ) http.Handler {
   127  	return &logSinkHandler{
   128  		newLogWriteCloser: newLogWriteCloser,
   129  		abort:             abort,
   130  		ratelimit:         ratelimit,
   131  		newStopChannel: func() (chan struct{}, func()) {
   132  			ch := make(chan struct{})
   133  			return ch, func() { close(ch) }
   134  		},
   135  		metrics:   metrics,
   136  		modelUUID: modelUUID,
   137  	}
   138  }
   139  
// logSinkHandler is the http.Handler returned by NewHTTPHandler. A single
// instance serves all logsink websocket connections for its endpoint.
type logSinkHandler struct {
	newLogWriteCloser NewLogWriteCloserFunc
	abort             <-chan struct{}
	ratelimit         *RateLimitConfig
	metrics           MetricsCollector
	modelUUID         string
	// mu serialises writes to the websocket (sendError and the close
	// message written in receiveLogs) and guards receiverStopped.
	mu                sync.Mutex

	// newStopChannel is overridden in tests so that we can check the
	// goroutine exits when prompted.
	newStopChannel  func() (chan struct{}, func())
	// receiverStopped records that the receiver goroutine's channel was
	// closed; it is not read here — presumably inspected by tests.
	receiverStopped bool
}
   153  
   154  // Since the logsink only receives messages, it is possible for the other end
   155  // to disappear without the server noticing. To fix this, we use the
   156  // underlying websocket control messages ping/pong. Periodically the server
   157  // writes a ping, and the other end replies with a pong. Now the tricky bit is
   158  // that it appears in all the examples found on the interweb that it is
   159  // possible for the control message to be sent successfully to something that
   160  // isn't entirely alive, which is why relying on an error return from the
   161  // write call is insufficient to mark the connection as dead. Instead the
   162  // write and read deadlines inherent in the underlying Go networking libraries
   163  // are used to force errors on timeouts. However the underlying network
   164  // libraries use time.Now() to determine whether or not to send errors, so
   165  // using a testing clock here isn't going to work. So we rely on manual
   166  // testing, and what is defined as good practice by the library authors.
   167  //
   168  // Now, in theory, we should be using this ping/pong across all the websockets,
   169  // but that is a little outside the scope of this piece of work.
   170  
const (
	// vZeroDelay is the read deadline for endpoints that don't support
	// ping/pong (i.e. agents prior to 2.2-beta1): such connections are
	// timed out after six hours of inactivity.
	vZeroDelay = 6 * time.Hour
)
   176  
// ServeHTTP implements the http.Handler interface.
//
// It upgrades the request to a websocket, sends an initial JSON error
// frame (nil on success), then forwards log records from the receiver
// goroutine to the LogWriteCloser until the server aborts, a ping or
// log write fails, or the receiver stops (client gone).
func (h *logSinkHandler) ServeHTTP(w http.ResponseWriter, req *http.Request) {
	// If the modelUUID from the request is empty, fallback to the one in
	// the logsink handler (controller UUID)
	resolvedModelUUID := h.modelUUID
	if modelUUID := httpcontext.RequestModelUUID(req); modelUUID != "" {
		resolvedModelUUID = modelUUID
	}

	handler := func(socket *websocket.Conn) {
		h.metrics.TotalConnections().Inc()
		h.metrics.Connections().Inc()
		defer h.metrics.Connections().Dec()

		defer socket.Close()
		endpointVersion, err := h.getVersion(req)
		if err != nil {
			h.sendError(socket, req, err)
			return
		}
		writer, err := h.newLogWriteCloser(req)
		if err != nil {
			h.sendError(socket, req, err)
			return
		}
		defer writer.Close()

		// If we get to here, no more errors to report, so we report a nil
		// error.  This way the first line of the socket is always a json
		// formatted simple error.
		h.sendError(socket, req, nil)

		// Here we configure the ping/pong handling for the websocket so the
		// server can notice when the client goes away. Older versions did not
		// respond to ping control messages, so don't try.
		var tickChannel <-chan time.Time
		if endpointVersion > 0 {
			// Each pong pushes the read deadline forward; if pongs stop
			// arriving, the next read fails and the receiver goroutine exits.
			_ = socket.SetReadDeadline(time.Now().Add(websocket.PongDelay))
			socket.SetPongHandler(func(string) error {
				logger.Tracef("pong logsink %p", socket)
				_ = socket.SetReadDeadline(time.Now().Add(websocket.PongDelay))
				return nil
			})
			ticker := time.NewTicker(websocket.PingPeriod)
			defer ticker.Stop()
			tickChannel = ticker.C
		} else {
			// Old agents never pong, so fall back to a long idle timeout.
			_ = socket.SetReadDeadline(time.Now().Add(vZeroDelay))
		}

		stopReceiving, closer := h.newStopChannel()
		defer closer()
		logCh := h.receiveLogs(socket, endpointVersion, resolvedModelUUID, stopReceiving)
		for {
			select {
			case <-h.abort:
				return
			case <-tickChannel:
				deadline := time.Now().Add(websocket.WriteWait)
				logger.Tracef("ping logsink %p", socket)
				if err := socket.WriteControl(gorillaws.PingMessage, []byte{}, deadline); err != nil {
					// This error is expected if the other end goes away. By
					// returning we clean up the strategy and close the socket
					// through the defer calls.
					logger.Debugf("failed to write ping: %s", err)
					// Bump the ping failure count.
					h.metrics.PingFailureCount(resolvedModelUUID).Inc()
					return
				}
			case m, ok := <-logCh:
				if !ok {
					// The receiver goroutine closed the channel. Record that
					// under the mutex (not read here; presumably for tests)
					// and shut the connection down via the defers.
					h.mu.Lock()
					defer h.mu.Unlock()
					h.receiverStopped = true
					return
				}

				if err := writer.WriteLog(m); err != nil {
					h.sendError(socket, req, err)
					// Increment the number of failure cases per modelUUID, that
					// we where unable to write a log to - note: we won't see
					// why the failure happens, only that it did happen. Maybe
					// we should add a trace log here. Developer mode for send
					// error might help if it was enabled at first ?
					h.metrics.LogWriteCount(resolvedModelUUID, metricLogWriteLabelFailure).Inc()
					return
				}

				// Increment the number of successful modelUUID log writes, so
				// that we can see what's a success over failure case
				h.metrics.LogWriteCount(resolvedModelUUID, metricLogWriteLabelSuccess).Inc()
			}
		}
	}
	websocket.Serve(w, req, handler)
}
   273  
   274  func (h *logSinkHandler) getVersion(req *http.Request) (int, error) {
   275  	verStr := req.URL.Query().Get("version")
   276  	switch verStr {
   277  	case "":
   278  		return 0, nil
   279  	case "1":
   280  		return 1, nil
   281  	default:
   282  		return 0, errors.Errorf("unknown version %q", verStr)
   283  	}
   284  }
   285  
// receiveLogs starts a goroutine that reads JSON-encoded log records
// from the websocket and delivers them on the returned channel. The
// channel is closed when the read loop exits (client disconnect, read
// error, or stop/abort), signalling ServeHTTP to finish. Receives are
// rate-limited per connection when h.ratelimit is configured.
func (h *logSinkHandler) receiveLogs(socket *websocket.Conn,
	endpointVersion int,
	resolvedModelUUID string,
	stop <-chan struct{},
) <-chan params.LogRecord {
	logCh := make(chan params.LogRecord)

	var tokenBucket *ratelimit.Bucket
	if h.ratelimit != nil {
		tokenBucket = ratelimit.NewBucketWithClock(
			h.ratelimit.Refill,
			h.ratelimit.Burst,
			ratelimitClock{h.ratelimit.Clock},
		)
	}

	go func() {
		// Close the channel to signal ServeHTTP to finish. Otherwise
		// we leak goroutines on client disconnect, because the server
		// isn't shutting down so h.abort is never closed.
		defer close(logCh)
		for {
			// Receive() blocks until data arrives but will also be
			// unblocked when the API handler calls socket.Close as it
			// finishes.

			// Do not lift the LogRecord outside of the for-loop as any fields
			// with omitempty will not get cleared between iterations. The
			// logsink has to work with different versions of juju we need to
			// ensure that we have consistent data, even at the cost of
			// performance.
			var m params.LogRecord
			if err := socket.ReadJSON(&m); err != nil {
				// Classify the failure for metrics: a normal close counts
				// as a disconnect, anything else as an error.
				if gorillaws.IsCloseError(err, gorillaws.CloseNormalClosure, gorillaws.CloseGoingAway) {
					logger.Tracef("logsink closed: %v", err)
					h.metrics.LogReadCount(resolvedModelUUID, metricLogReadLabelDisconnect).Inc()
				} else if gorillaws.IsUnexpectedCloseError(err, gorillaws.CloseNormalClosure, gorillaws.CloseGoingAway) {
					logger.Debugf("logsink unexpected close error: %v", err)
					h.metrics.LogReadCount(resolvedModelUUID, metricLogReadLabelError).Inc()
				} else {
					logger.Debugf("logsink error: %v", err)
					h.metrics.LogReadCount(resolvedModelUUID, metricLogReadLabelError).Inc()
				}
				// Try to tell the other end we are closing. If the other end
				// has already disconnected from us, this will fail, but we don't
				// care that much.
				// The mutex serialises this write with sendError's writes.
				h.mu.Lock()
				defer h.mu.Unlock()
				_ = socket.WriteMessage(gorillaws.CloseMessage, []byte{})
				return
			}
			h.metrics.LogReadCount(resolvedModelUUID, metricLogReadLabelSuccess).Inc()

			// Rate-limit receipt of log messages. We rate-limit
			// each connection individually to prevent one noisy
			// individual from drowning out the others.
			if tokenBucket != nil {
				if d := tokenBucket.Take(1); d > 0 {
					select {
					case <-h.ratelimit.Clock.After(d):
					case <-h.abort:
						return
					}
				}
			}

			// Send the log message.
			select {
			case <-h.abort:
				// The API server is stopping.
				return
			case <-stop:
				// The ServeHTTP handler has stopped.
				return
			case logCh <- m:
				// If the remote end does not support ping/pong, we bump
				// the read deadline everytime a message is received.
				if endpointVersion == 0 {
					_ = socket.SetReadDeadline(time.Now().Add(vZeroDelay))
				}
			}
		}
	}()

	return logCh
}
   372  
   373  // sendError sends a JSON-encoded error response.
   374  func (h *logSinkHandler) sendError(ws *websocket.Conn, req *http.Request, err error) {
   375  	// There is no need to log the error for normal operators as there is nothing
   376  	// they can action. This is for developers.
   377  	if err != nil && featureflag.Enabled(feature.DeveloperMode) {
   378  		logger.Errorf("returning error from %s %s: %s", req.Method, req.URL.Path, errors.Details(err))
   379  	}
   380  	h.mu.Lock()
   381  	defer h.mu.Unlock()
   382  	if sendErr := ws.SendInitialErrorV0(err); sendErr != nil {
   383  		logger.Errorf("closing websocket, %v", err)
   384  		ws.Close()
   385  	}
   386  }
   387  
   388  // ratelimitClock adapts clock.Clock to ratelimit.Clock.
   389  type ratelimitClock struct {
   390  	clock.Clock
   391  }
   392  
   393  // Sleep is defined by the ratelimit.Clock interface.
   394  func (c ratelimitClock) Sleep(d time.Duration) {
   395  	<-c.Clock.After(d)
   396  }