github.com/niedbalski/juju@v0.0.0-20190215020005-8ff100488e47/apiserver/logsink/logsink.go

github.com/niedbalski/juju@v0.0.0-20190215020005-8ff100488e47/apiserver/logsink/logsink.go (about)

     1  // Copyright 2015-2017 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package logsink
     5  
     6  import (
     7  	"io"
     8  	"net/http"
     9  	"sync"
    10  	"time"
    11  
    12  	gorillaws "github.com/gorilla/websocket"
    13  	"github.com/juju/clock"
    14  	"github.com/juju/errors"
    15  	"github.com/juju/loggo"
    16  	"github.com/juju/ratelimit"
    17  	"github.com/juju/utils/featureflag"
    18  	"github.com/juju/version"
    19  	"github.com/prometheus/client_golang/prometheus"
    20  
    21  	"github.com/juju/juju/apiserver/httpcontext"
    22  	"github.com/juju/juju/apiserver/params"
    23  	"github.com/juju/juju/apiserver/websocket"
    24  	"github.com/juju/juju/feature"
    25  )
    26  
    27  const (
    28  	metricLogWriteLabelSuccess = "success"
    29  	metricLogWriteLabelFailure = "failure"
    30  )
    31  
    32  const (
    33  	metricLogReadLabelError      = "error"
    34  	metricLogReadLabelDisconnect = "disconnect"
    35  )
    36  
    37  var logger = loggo.GetLogger("juju.apiserver.logsink")
    38  
    39  // LogWriteCloser provides an interface for persisting log records.
    40  // The LogCloser's Close method should be called to release any
    41  // resources once it is done with.
    42  type LogWriteCloser interface {
    43  	io.Closer
    44  
    45  	// WriteLog writes out the given log record.
    46  	WriteLog(params.LogRecord) error
    47  }
    48  
    49  // NewLogWriteCloserFunc returns a new LogWriteCloser for the given http.Request.
    50  type NewLogWriteCloserFunc func(*http.Request) (LogWriteCloser, error)
    51  
    52  // RateLimitConfig contains the rate-limit configuration for the logsink
    53  // handler.
    54  type RateLimitConfig struct {
    55  	// Burst is the number of log messages that will be let through before
    56  	// we start rate limiting.
    57  	Burst int64
    58  
    59  	// Refill is the rate at which log messages will be let through once
    60  	// the initial burst amount has been depleted.
    61  	Refill time.Duration
    62  
    63  	// Clock is the clock used to wait when rate-limiting log receives.
    64  	Clock clock.Clock
    65  }
    66  
    67  // CounterVec is a Collector that bundles a set of Counters that all share the
    68  // same description.
    69  type CounterVec interface {
    70  	// With returns a Counter for a given labels slice
    71  	With(prometheus.Labels) prometheus.Counter
    72  }
    73  
    74  // GaugeVec is a Collector that bundles a set of Gauges that all share the
    75  // same description.
    76  type GaugeVec interface {
    77  	// With returns a Gauge for a given labels slice
    78  	With(prometheus.Labels) prometheus.Gauge
    79  }
    80  
    81  // MetricsCollector represents a way to change the metrics for the logsink
    82  // api handler.
    83  //go:generate mockgen -package mocks -destination mocks/metrics_collector_mock.go github.com/juju/juju/apiserver/logsink MetricsCollector
    84  //go:generate mockgen -package mocks -destination mocks/metrics_mock.go github.com/prometheus/client_golang/prometheus Counter,Gauge
    85  type MetricsCollector interface {
    86  
    87  	// TotalConnections returns a prometheus metric that can be incremented
    88  	// as a counter for the total number connections being served from the api
    89  	// handler.
    90  	TotalConnections() prometheus.Counter
    91  
    92  	// Connections returns a prometheus metric that can be incremented and
    93  	// decremented as a gauge, for the number connections being current served
    94  	// from the api handler.
    95  	Connections() prometheus.Gauge
    96  
    97  	// PingFailureCount returns a prometheus metric for the number of
    98  	// ping failures per model uuid, that can be incremented as
    99  	// a counter.
   100  	PingFailureCount(modelUUID string) prometheus.Counter
   101  
   102  	// LogWriteCount returns a prometheus metric for the number of writes to
   103  	// the log that happened. It's split on the success/failure, so the charts
   104  	// will have to take that into account.
   105  	LogWriteCount(modelUUID, state string) prometheus.Counter
   106  
   107  	// LogReadCount returns a prometheus metric for the number of reads to
   108  	// the log that happened. It's split on the success/error/disconnect, so
   109  	// the charts will have to take that into account.
   110  	LogReadCount(modelUUID, state string) prometheus.Counter
   111  }
   112  
   113  // NewHTTPHandler returns a new http.Handler for receiving log messages over a
   114  // websocket, using the given NewLogWriteCloserFunc to obtain a writer to which
   115  // the log messages will be written.
   116  //
   117  // ratelimit defines an optional rate-limit configuration. If nil, no rate-
   118  // limiting will be applied.
   119  func NewHTTPHandler(
   120  	newLogWriteCloser NewLogWriteCloserFunc,
   121  	abort <-chan struct{},
   122  	ratelimit *RateLimitConfig,
   123  	metrics MetricsCollector,
   124  	modelUUID string,
   125  ) http.Handler {
   126  	return &logSinkHandler{
   127  		newLogWriteCloser: newLogWriteCloser,
   128  		abort:             abort,
   129  		ratelimit:         ratelimit,
   130  		newStopChannel: func() (chan struct{}, func()) {
   131  			ch := make(chan struct{})
   132  			return ch, func() { close(ch) }
   133  		},
   134  		metrics:   metrics,
   135  		modelUUID: modelUUID,
   136  	}
   137  }
   138  
   139  type logSinkHandler struct {
   140  	newLogWriteCloser NewLogWriteCloserFunc
   141  	abort             <-chan struct{}
   142  	ratelimit         *RateLimitConfig
   143  	metrics           MetricsCollector
   144  	modelUUID         string
   145  	mu                sync.Mutex
   146  
   147  	// newStopChannel is overridden in tests so that we can check the
   148  	// goroutine exits when prompted.
   149  	newStopChannel  func() (chan struct{}, func())
   150  	receiverStopped bool
   151  }
   152  
   153  // Since the logsink only receives messages, it is possible for the other end
   154  // to disappear without the server noticing. To fix this, we use the
   155  // underlying websocket control messages ping/pong. Periodically the server
   156  // writes a ping, and the other end replies with a pong. Now the tricky bit is
   157  // that it appears in all the examples found on the interweb that it is
   158  // possible for the control message to be sent successfully to something that
   159  // isn't entirely alive, which is why relying on an error return from the
   160  // write call is insufficient to mark the connection as dead. Instead the
   161  // write and read deadlines inherent in the underlying Go networking libraries
   162  // are used to force errors on timeouts. However the underlying network
   163  // libraries use time.Now() to determine whether or not to send errors, so
   164  // using a testing clock here isn't going to work. So we rely on manual
   165  // testing, and what is defined as good practice by the library authors.
   166  //
   167  // Now, in theory, we should be using this ping/pong across all the websockets,
   168  // but that is a little outside the scope of this piece of work.
   169  
   170  const (
   171  	// For endpoints that don't support ping/pong (i.e. agents prior to 2.2-beta1)
   172  	// we will time out their connections after six hours of inactivity.
   173  	vZeroDelay = 6 * time.Hour
   174  )
   175  
   176  // ServeHTTP implements the http.Handler interface.
   177  func (h *logSinkHandler) ServeHTTP(w http.ResponseWriter, req *http.Request) {
   178  	// If the modelUUID from the request is empty, fallback to the one in
   179  	// the logsink handler (controller UUID)
   180  	resolvedModelUUID := h.modelUUID
   181  	if modelUUID := httpcontext.RequestModelUUID(req); modelUUID != "" {
   182  		resolvedModelUUID = modelUUID
   183  	}
   184  
   185  	handler := func(socket *websocket.Conn) {
   186  		h.metrics.TotalConnections().Inc()
   187  		h.metrics.Connections().Inc()
   188  		defer h.metrics.Connections().Dec()
   189  
   190  		defer socket.Close()
   191  		endpointVersion, err := h.getVersion(req)
   192  		if err != nil {
   193  			h.sendError(socket, req, err)
   194  			return
   195  		}
   196  		writer, err := h.newLogWriteCloser(req)
   197  		if err != nil {
   198  			h.sendError(socket, req, err)
   199  			return
   200  		}
   201  		defer writer.Close()
   202  
   203  		// If we get to here, no more errors to report, so we report a nil
   204  		// error.  This way the first line of the socket is always a json
   205  		// formatted simple error.
   206  		h.sendError(socket, req, nil)
   207  
   208  		// Here we configure the ping/pong handling for the websocket so the
   209  		// server can notice when the client goes away. Older versions did not
   210  		// respond to ping control messages, so don't try.
   211  		var tickChannel <-chan time.Time
   212  		if endpointVersion > 0 {
   213  			socket.SetReadDeadline(time.Now().Add(websocket.PongDelay))
   214  			socket.SetPongHandler(func(string) error {
   215  				logger.Tracef("pong logsink %p", socket)
   216  				socket.SetReadDeadline(time.Now().Add(websocket.PongDelay))
   217  				return nil
   218  			})
   219  			ticker := time.NewTicker(websocket.PingPeriod)
   220  			defer ticker.Stop()
   221  			tickChannel = ticker.C
   222  		} else {
   223  			socket.SetReadDeadline(time.Now().Add(vZeroDelay))
   224  		}
   225  
   226  		stopReceiving, closer := h.newStopChannel()
   227  		defer closer()
   228  		logCh := h.receiveLogs(socket, endpointVersion, resolvedModelUUID, stopReceiving)
   229  		for {
   230  			select {
   231  			case <-h.abort:
   232  				return
   233  			case <-tickChannel:
   234  				deadline := time.Now().Add(websocket.WriteWait)
   235  				logger.Tracef("ping logsink %p", socket)
   236  				if err := socket.WriteControl(gorillaws.PingMessage, []byte{}, deadline); err != nil {
   237  					// This error is expected if the other end goes away. By
   238  					// returning we clean up the strategy and close the socket
   239  					// through the defer calls.
   240  					logger.Debugf("failed to write ping: %s", err)
   241  					// Bump the ping failure count.
   242  					h.metrics.PingFailureCount(resolvedModelUUID).Inc()
   243  					return
   244  				}
   245  			case m, ok := <-logCh:
   246  				if !ok {
   247  					h.mu.Lock()
   248  					defer h.mu.Unlock()
   249  					h.receiverStopped = true
   250  					return
   251  				}
   252  
   253  				if err := writer.WriteLog(m); err != nil {
   254  					h.sendError(socket, req, err)
   255  					// Increment the number of failure cases per modelUUID, that
   256  					// we where unable to write a log to - note: we won't see
   257  					// why the failure happens, only that it did happen. Maybe
   258  					// we should add a trace log here. Developer mode for send
   259  					// error might help if it was enabled at first ?
   260  					h.metrics.LogWriteCount(resolvedModelUUID, metricLogWriteLabelFailure).Inc()
   261  					return
   262  				}
   263  
   264  				// Increment the number of successful modelUUID log writes, so
   265  				// that we can see what's a success over failure case
   266  				h.metrics.LogWriteCount(resolvedModelUUID, metricLogWriteLabelSuccess).Inc()
   267  			}
   268  		}
   269  	}
   270  	websocket.Serve(w, req, handler)
   271  }
   272  
   273  func (h *logSinkHandler) getVersion(req *http.Request) (int, error) {
   274  	verStr := req.URL.Query().Get("version")
   275  	switch verStr {
   276  	case "":
   277  		return 0, nil
   278  	case "1":
   279  		return 1, nil
   280  	default:
   281  		return 0, errors.Errorf("unknown version %q", verStr)
   282  	}
   283  }
   284  
   285  func (h *logSinkHandler) receiveLogs(socket *websocket.Conn,
   286  	endpointVersion int,
   287  	resolvedModelUUID string,
   288  	stop <-chan struct{},
   289  ) <-chan params.LogRecord {
   290  	logCh := make(chan params.LogRecord)
   291  
   292  	var tokenBucket *ratelimit.Bucket
   293  	if h.ratelimit != nil {
   294  		tokenBucket = ratelimit.NewBucketWithClock(
   295  			h.ratelimit.Refill,
   296  			h.ratelimit.Burst,
   297  			ratelimitClock{h.ratelimit.Clock},
   298  		)
   299  	}
   300  
   301  	go func() {
   302  		// Close the channel to signal ServeHTTP to finish. Otherwise
   303  		// we leak goroutines on client disconnect, because the server
   304  		// isn't shutting down so h.abort is never closed.
   305  		defer close(logCh)
   306  		var m params.LogRecord
   307  		for {
   308  			// Receive() blocks until data arrives but will also be
   309  			// unblocked when the API handler calls socket.Close as it
   310  			// finishes.
   311  			if err := socket.ReadJSON(&m); err != nil {
   312  				if gorillaws.IsUnexpectedCloseError(err, gorillaws.CloseNormalClosure, gorillaws.CloseGoingAway) {
   313  					logger.Debugf("logsink receive error: %v", err)
   314  					h.metrics.LogReadCount(resolvedModelUUID, metricLogReadLabelError).Inc()
   315  				} else {
   316  					logger.Debugf("disconnected, %p", socket)
   317  					h.metrics.LogReadCount(resolvedModelUUID, metricLogReadLabelDisconnect).Inc()
   318  				}
   319  				// Try to tell the other end we are closing. If the other end
   320  				// has already disconnected from us, this will fail, but we don't
   321  				// care that much.
   322  				h.mu.Lock()
   323  				defer h.mu.Unlock()
   324  				socket.WriteMessage(gorillaws.CloseMessage, []byte{})
   325  				return
   326  			}
   327  
   328  			// Rate-limit receipt of log messages. We rate-limit
   329  			// each connection individually to prevent one noisy
   330  			// individual from drowning out the others.
   331  			if tokenBucket != nil {
   332  				if d := tokenBucket.Take(1); d > 0 {
   333  					select {
   334  					case <-h.ratelimit.Clock.After(d):
   335  					case <-h.abort:
   336  						return
   337  					}
   338  				}
   339  			}
   340  
   341  			// Send the log message.
   342  			select {
   343  			case <-h.abort:
   344  				// The API server is stopping.
   345  				return
   346  			case <-stop:
   347  				// The ServeHTTP handler has stopped.
   348  				return
   349  			case logCh <- m:
   350  				// If the remote end does not support ping/pong, we bump
   351  				// the read deadline everytime a message is received.
   352  				if endpointVersion == 0 {
   353  					socket.SetReadDeadline(time.Now().Add(vZeroDelay))
   354  				}
   355  			}
   356  		}
   357  	}()
   358  
   359  	return logCh
   360  }
   361  
   362  // sendError sends a JSON-encoded error response.
   363  func (h *logSinkHandler) sendError(ws *websocket.Conn, req *http.Request, err error) {
   364  	// There is no need to log the error for normal operators as there is nothing
   365  	// they can action. This is for developers.
   366  	if err != nil && featureflag.Enabled(feature.DeveloperMode) {
   367  		logger.Errorf("returning error from %s %s: %s", req.Method, req.URL.Path, errors.Details(err))
   368  	}
   369  	h.mu.Lock()
   370  	defer h.mu.Unlock()
   371  	if sendErr := ws.SendInitialErrorV0(err); sendErr != nil {
   372  		logger.Errorf("closing websocket, %v", err)
   373  		ws.Close()
   374  	}
   375  }
   376  
   377  // JujuClientVersionFromRequest returns the Juju client version
   378  // number from the HTTP request.
   379  func JujuClientVersionFromRequest(req *http.Request) (version.Number, error) {
   380  	verStr := req.URL.Query().Get("jujuclientversion")
   381  	if verStr == "" {
   382  		return version.Zero, errors.New(`missing "jujuclientversion" in URL query`)
   383  	}
   384  	ver, err := version.Parse(verStr)
   385  	if err != nil {
   386  		return version.Zero, errors.Annotatef(err, "invalid jujuclientversion %q", verStr)
   387  	}
   388  	return ver, nil
   389  }
   390  
   391  // ratelimitClock adapts clock.Clock to ratelimit.Clock.
   392  type ratelimitClock struct {
   393  	clock.Clock
   394  }
   395  
   396  // Sleep is defined by the ratelimit.Clock interface.
   397  func (c ratelimitClock) Sleep(d time.Duration) {
   398  	<-c.Clock.After(d)
   399  }