github.com/niedbalski/juju@v0.0.0-20190215020005-8ff100488e47/apiserver/logsink/logsink.go

// Copyright 2015-2017 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package logsink

import (
	"io"
	"net/http"
	"sync"
	"time"

	gorillaws "github.com/gorilla/websocket"
	"github.com/juju/clock"
	"github.com/juju/errors"
	"github.com/juju/loggo"
	"github.com/juju/ratelimit"
	"github.com/juju/utils/featureflag"
	"github.com/juju/version"
	"github.com/prometheus/client_golang/prometheus"

	"github.com/juju/juju/apiserver/httpcontext"
	"github.com/juju/juju/apiserver/params"
	"github.com/juju/juju/apiserver/websocket"
	"github.com/juju/juju/feature"
)

const (
	metricLogWriteLabelSuccess = "success"
	metricLogWriteLabelFailure = "failure"
)

const (
	metricLogReadLabelError      = "error"
	metricLogReadLabelDisconnect = "disconnect"
)

var logger = loggo.GetLogger("juju.apiserver.logsink")

// LogWriteCloser provides an interface for persisting log records.
// The LogWriteCloser's Close method should be called to release any
// resources once the caller is done with it.
type LogWriteCloser interface {
	io.Closer

	// WriteLog writes out the given log record.
	WriteLog(params.LogRecord) error
}

// NewLogWriteCloserFunc returns a new LogWriteCloser for the given http.Request.
type NewLogWriteCloserFunc func(*http.Request) (LogWriteCloser, error)
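// The example below is a minimal, hypothetical sketch (not part of this
// package's real API surface) of a LogWriteCloser implementation and a
// matching NewLogWriteCloserFunc; the exampleLogWriteCloser and
// exampleNewLogWriteCloser names are illustrative only. It shows the
// contract: WriteLog persists a single record and Close releases resources.
type exampleLogWriteCloser struct{}

// WriteLog handles one log record; this sketch just traces it.
func (exampleLogWriteCloser) WriteLog(rec params.LogRecord) error {
	logger.Tracef("received log record: %+v", rec)
	return nil
}

// Close releases any resources held by the writer; nothing to release here.
func (exampleLogWriteCloser) Close() error {
	return nil
}

// exampleNewLogWriteCloser builds a writer for an incoming request.
func exampleNewLogWriteCloser(*http.Request) (LogWriteCloser, error) {
	return exampleLogWriteCloser{}, nil
}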
// RateLimitConfig contains the rate-limit configuration for the logsink
// handler.
type RateLimitConfig struct {
	// Burst is the number of log messages that will be let through before
	// we start rate limiting.
	Burst int64

	// Refill is the rate at which log messages will be let through once
	// the initial burst amount has been depleted.
	Refill time.Duration

	// Clock is the clock used to wait when rate-limiting log receives.
	Clock clock.Clock
}

// CounterVec is a Collector that bundles a set of Counters that all share the
// same description.
type CounterVec interface {
	// With returns a Counter for the given set of labels.
	With(prometheus.Labels) prometheus.Counter
}

// GaugeVec is a Collector that bundles a set of Gauges that all share the
// same description.
type GaugeVec interface {
	// With returns a Gauge for the given set of labels.
	With(prometheus.Labels) prometheus.Gauge
}

// MetricsCollector represents a way to change the metrics for the logsink
// api handler.
//go:generate mockgen -package mocks -destination mocks/metrics_collector_mock.go github.com/juju/juju/apiserver/logsink MetricsCollector
//go:generate mockgen -package mocks -destination mocks/metrics_mock.go github.com/prometheus/client_golang/prometheus Counter,Gauge
type MetricsCollector interface {

	// TotalConnections returns a prometheus metric that can be incremented
	// as a counter for the total number of connections served by the api
	// handler.
	TotalConnections() prometheus.Counter

	// Connections returns a prometheus metric that can be incremented and
	// decremented as a gauge, for the number of connections currently being
	// served by the api handler.
	Connections() prometheus.Gauge

	// PingFailureCount returns a prometheus metric for the number of
	// ping failures per model UUID, that can be incremented as
	// a counter.
	PingFailureCount(modelUUID string) prometheus.Counter

	// LogWriteCount returns a prometheus metric for the number of log
	// writes that happened. It's split by success/failure, so the charts
	// will have to take that into account.
	LogWriteCount(modelUUID, state string) prometheus.Counter

	// LogReadCount returns a prometheus metric for the number of log
	// reads that happened. It's split by success/error/disconnect, so
	// the charts will have to take that into account.
	LogReadCount(modelUUID, state string) prometheus.Counter
}

// NewHTTPHandler returns a new http.Handler for receiving log messages over a
// websocket, using the given NewLogWriteCloserFunc to obtain a writer to which
// the log messages will be written.
//
// ratelimit defines an optional rate-limit configuration. If nil, no rate-
// limiting will be applied.
func NewHTTPHandler(
	newLogWriteCloser NewLogWriteCloserFunc,
	abort <-chan struct{},
	ratelimit *RateLimitConfig,
	metrics MetricsCollector,
	modelUUID string,
) http.Handler {
	return &logSinkHandler{
		newLogWriteCloser: newLogWriteCloser,
		abort:             abort,
		ratelimit:         ratelimit,
		newStopChannel: func() (chan struct{}, func()) {
			ch := make(chan struct{})
			return ch, func() { close(ch) }
		},
		metrics:   metrics,
		modelUUID: modelUUID,
	}
}
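// The wiring sketch below is hypothetical (not part of the juju apiserver);
// the exampleMountLogsink name, the "/logsink" path, and the burst/refill
// numbers are assumptions chosen for illustration. It shows how a caller
// might construct the handler with per-connection rate limiting and register
// it on a plain net/http mux.
func exampleMountLogsink(
	mux *http.ServeMux,
	newWriter NewLogWriteCloserFunc,
	metrics MetricsCollector,
	controllerUUID string,
	abort <-chan struct{},
) {
	handler := NewHTTPHandler(
		newWriter,
		abort,
		&RateLimitConfig{
			// Allow an initial burst of 1000 records, then refill one
			// token per millisecond (roughly 1000 records per second).
			Burst:  1000,
			Refill: time.Millisecond,
			Clock:  clock.WallClock,
		},
		metrics,
		controllerUUID,
	)
	mux.Handle("/logsink", handler)
}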
type logSinkHandler struct {
	newLogWriteCloser NewLogWriteCloserFunc
	abort             <-chan struct{}
	ratelimit         *RateLimitConfig
	metrics           MetricsCollector
	modelUUID         string
	mu                sync.Mutex

	// newStopChannel is overridden in tests so that we can check the
	// goroutine exits when prompted.
	newStopChannel  func() (chan struct{}, func())
	receiverStopped bool
}

// Since the logsink only receives messages, it is possible for the other end
// to disappear without the server noticing. To fix this, we use the
// underlying websocket control messages ping/pong. Periodically the server
// writes a ping, and the other end replies with a pong. Now the tricky bit is
// that it appears in all the examples found on the interweb that it is
// possible for the control message to be sent successfully to something that
// isn't entirely alive, which is why relying on an error return from the
// write call is insufficient to mark the connection as dead. Instead the
// write and read deadlines inherent in the underlying Go networking libraries
// are used to force errors on timeouts. However the underlying network
// libraries use time.Now() to determine whether or not to send errors, so
// using a testing clock here isn't going to work. So we rely on manual
// testing, and what is defined as good practice by the library authors.
//
// Now, in theory, we should be using this ping/pong across all the websockets,
// but that is a little outside the scope of this piece of work.

const (
	// For endpoints that don't support ping/pong (i.e. agents prior to 2.2-beta1)
	// we will time out their connections after six hours of inactivity.
	vZeroDelay = 6 * time.Hour
)

// ServeHTTP implements the http.Handler interface.
func (h *logSinkHandler) ServeHTTP(w http.ResponseWriter, req *http.Request) {
	// If the modelUUID from the request is empty, fall back to the one in
	// the logsink handler (the controller UUID).
	resolvedModelUUID := h.modelUUID
	if modelUUID := httpcontext.RequestModelUUID(req); modelUUID != "" {
		resolvedModelUUID = modelUUID
	}

	handler := func(socket *websocket.Conn) {
		h.metrics.TotalConnections().Inc()
		h.metrics.Connections().Inc()
		defer h.metrics.Connections().Dec()

		defer socket.Close()
		endpointVersion, err := h.getVersion(req)
		if err != nil {
			h.sendError(socket, req, err)
			return
		}
		writer, err := h.newLogWriteCloser(req)
		if err != nil {
			h.sendError(socket, req, err)
			return
		}
		defer writer.Close()

		// If we get to here, there are no more errors to report, so we
		// report a nil error. This way the first line of the socket is
		// always a JSON-formatted simple error.
		h.sendError(socket, req, nil)

		// Here we configure the ping/pong handling for the websocket so the
		// server can notice when the client goes away. Older versions did not
		// respond to ping control messages, so don't try.
		var tickChannel <-chan time.Time
		if endpointVersion > 0 {
			socket.SetReadDeadline(time.Now().Add(websocket.PongDelay))
			socket.SetPongHandler(func(string) error {
				logger.Tracef("pong logsink %p", socket)
				socket.SetReadDeadline(time.Now().Add(websocket.PongDelay))
				return nil
			})
			ticker := time.NewTicker(websocket.PingPeriod)
			defer ticker.Stop()
			tickChannel = ticker.C
		} else {
			socket.SetReadDeadline(time.Now().Add(vZeroDelay))
		}

		stopReceiving, closer := h.newStopChannel()
		defer closer()
		logCh := h.receiveLogs(socket, endpointVersion, resolvedModelUUID, stopReceiving)
		for {
			select {
			case <-h.abort:
				return
			case <-tickChannel:
				deadline := time.Now().Add(websocket.WriteWait)
				logger.Tracef("ping logsink %p", socket)
				if err := socket.WriteControl(gorillaws.PingMessage, []byte{}, deadline); err != nil {
					// This error is expected if the other end goes away. By
					// returning we clean up the strategy and close the socket
					// through the defer calls.
					logger.Debugf("failed to write ping: %s", err)
					// Bump the ping failure count.
					h.metrics.PingFailureCount(resolvedModelUUID).Inc()
					return
				}
			case m, ok := <-logCh:
				if !ok {
					h.mu.Lock()
					defer h.mu.Unlock()
					h.receiverStopped = true
					return
				}

				if err := writer.WriteLog(m); err != nil {
					h.sendError(socket, req, err)
					// Increment the failure count for this modelUUID when we
					// were unable to write a log record. Note: we won't see
					// why the failure happened, only that it did happen.
					// Maybe we should add a trace log here; developer mode
					// for sendError might help if it was enabled from the
					// start.
					h.metrics.LogWriteCount(resolvedModelUUID, metricLogWriteLabelFailure).Inc()
					return
				}

				// Increment the successful log write count for this
				// modelUUID, so that we can compare successes against
				// failures.
				h.metrics.LogWriteCount(resolvedModelUUID, metricLogWriteLabelSuccess).Inc()
			}
		}
	}
	websocket.Serve(w, req, handler)
}
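// The client sketch below is hypothetical (juju agents use their own logsink
// client code); the exampleLogsinkClient name is an assumption, and the
// initial error frame is read into a generic map rather than a concrete
// params type. It illustrates the protocol ServeHTTP expects: dial with
// ?version=1, read the initial JSON-formatted error, keep a read loop
// running so gorilla/websocket's default ping handler can answer the
// server's pings, and stream params.LogRecord values as JSON messages.
func exampleLogsinkClient(url string, records <-chan params.LogRecord) error {
	conn, _, err := gorillaws.DefaultDialer.Dial(url+"?version=1", nil)
	if err != nil {
		return errors.Trace(err)
	}
	defer conn.Close()

	// The first frame from the server is a JSON error result; an empty
	// error means the connection was accepted.
	var initial map[string]interface{}
	if err := conn.ReadJSON(&initial); err != nil {
		return errors.Trace(err)
	}

	// Keep reading so that ping control frames are processed and answered
	// by the default ping handler, which replies with a pong.
	go func() {
		for {
			if _, _, err := conn.ReadMessage(); err != nil {
				return
			}
		}
	}()

	// Stream log records as JSON until the channel closes.
	for rec := range records {
		if err := conn.WriteJSON(rec); err != nil {
			return errors.Trace(err)
		}
	}
	return nil
}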
func (h *logSinkHandler) getVersion(req *http.Request) (int, error) {
	verStr := req.URL.Query().Get("version")
	switch verStr {
	case "":
		return 0, nil
	case "1":
		return 1, nil
	default:
		return 0, errors.Errorf("unknown version %q", verStr)
	}
}

func (h *logSinkHandler) receiveLogs(socket *websocket.Conn,
	endpointVersion int,
	resolvedModelUUID string,
	stop <-chan struct{},
) <-chan params.LogRecord {
	logCh := make(chan params.LogRecord)

	var tokenBucket *ratelimit.Bucket
	if h.ratelimit != nil {
		tokenBucket = ratelimit.NewBucketWithClock(
			h.ratelimit.Refill,
			h.ratelimit.Burst,
			ratelimitClock{h.ratelimit.Clock},
		)
	}

	go func() {
		// Close the channel to signal ServeHTTP to finish. Otherwise
		// we leak goroutines on client disconnect, because the server
		// isn't shutting down so h.abort is never closed.
		defer close(logCh)
		var m params.LogRecord
		for {
			// ReadJSON blocks until data arrives, but will also be
			// unblocked when the API handler calls socket.Close as it
			// finishes.
			if err := socket.ReadJSON(&m); err != nil {
				if gorillaws.IsUnexpectedCloseError(err, gorillaws.CloseNormalClosure, gorillaws.CloseGoingAway) {
					logger.Debugf("logsink receive error: %v", err)
					h.metrics.LogReadCount(resolvedModelUUID, metricLogReadLabelError).Inc()
				} else {
					logger.Debugf("disconnected, %p", socket)
					h.metrics.LogReadCount(resolvedModelUUID, metricLogReadLabelDisconnect).Inc()
				}
				// Try to tell the other end we are closing. If the other end
				// has already disconnected from us, this will fail, but we
				// don't care that much.
				h.mu.Lock()
				defer h.mu.Unlock()
				socket.WriteMessage(gorillaws.CloseMessage, []byte{})
				return
			}

			// Rate-limit receipt of log messages. We rate-limit
			// each connection individually to prevent one noisy
			// individual from drowning out the others.
			if tokenBucket != nil {
				if d := tokenBucket.Take(1); d > 0 {
					select {
					case <-h.ratelimit.Clock.After(d):
					case <-h.abort:
						return
					}
				}
			}

			// Send the log message.
			select {
			case <-h.abort:
				// The API server is stopping.
				return
			case <-stop:
				// The ServeHTTP handler has stopped.
				return
			case logCh <- m:
				// If the remote end does not support ping/pong, we bump
				// the read deadline every time a message is received.
				if endpointVersion == 0 {
					socket.SetReadDeadline(time.Now().Add(vZeroDelay))
				}
			}
		}
	}()

	return logCh
}

// sendError sends a JSON-encoded error response.
func (h *logSinkHandler) sendError(ws *websocket.Conn, req *http.Request, err error) {
	// There is no need to log the error for normal operators as there is
	// nothing they can do about it. This is for developers.
	if err != nil && featureflag.Enabled(feature.DeveloperMode) {
		logger.Errorf("returning error from %s %s: %s", req.Method, req.URL.Path, errors.Details(err))
	}
	h.mu.Lock()
	defer h.mu.Unlock()
	if sendErr := ws.SendInitialErrorV0(err); sendErr != nil {
		logger.Errorf("closing websocket, %v", err)
		ws.Close()
	}
}
// JujuClientVersionFromRequest returns the Juju client version
// number from the HTTP request.
func JujuClientVersionFromRequest(req *http.Request) (version.Number, error) {
	verStr := req.URL.Query().Get("jujuclientversion")
	if verStr == "" {
		return version.Zero, errors.New(`missing "jujuclientversion" in URL query`)
	}
	ver, err := version.Parse(verStr)
	if err != nil {
		return version.Zero, errors.Annotatef(err, "invalid jujuclientversion %q", verStr)
	}
	return ver, nil
}

// ratelimitClock adapts clock.Clock to ratelimit.Clock.
type ratelimitClock struct {
	clock.Clock
}

// Sleep is defined by the ratelimit.Clock interface.
func (c ratelimitClock) Sleep(d time.Duration) {
	<-c.Clock.After(d)
}