// Copyright 2015-2017 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package logsink

import (
	"io"
	"net/http"
	"sync"
	"time"

	gorillaws "github.com/gorilla/websocket"
	"github.com/juju/clock"
	"github.com/juju/errors"
	"github.com/juju/featureflag"
	"github.com/juju/loggo"
	"github.com/juju/ratelimit"
	"github.com/prometheus/client_golang/prometheus"

	"github.com/juju/juju/apiserver/httpcontext"
	"github.com/juju/juju/apiserver/websocket"
	"github.com/juju/juju/feature"
	"github.com/juju/juju/rpc/params"
)

// Label values for the per-model log-write counter metric, distinguishing
// successful log writes from failed ones.
const (
	metricLogWriteLabelSuccess = "success"
	metricLogWriteLabelFailure = "failure"
)

// Label values for the per-model log-read counter metric, distinguishing
// successful reads, read errors, and normal client disconnects.
const (
	metricLogReadLabelSuccess    = "success"
	metricLogReadLabelError      = "error"
	metricLogReadLabelDisconnect = "disconnect"
)

var logger = loggo.GetLogger("juju.apiserver.logsink")

// LogWriteCloser provides an interface for persisting log records.
// The LogCloser's Close method should be called to release any
// resources once it is done with.
type LogWriteCloser interface {
	io.Closer

	// WriteLog writes out the given log record.
	WriteLog(params.LogRecord) error
}

// NewLogWriteCloserFunc returns a new LogWriteCloser for the given http.Request.
type NewLogWriteCloserFunc func(*http.Request) (LogWriteCloser, error)

// RateLimitConfig contains the rate-limit configuration for the logsink
// handler.
type RateLimitConfig struct {
	// Burst is the number of log messages that will be let through before
	// we start rate limiting.
	Burst int64

	// Refill is the rate at which log messages will be let through once
	// the initial burst amount has been depleted.
	Refill time.Duration

	// Clock is the clock used to wait when rate-limiting log receives.
	Clock clock.Clock
}

// CounterVec is a Collector that bundles a set of Counters that all share the
// same description.
type CounterVec interface {
	// With returns a Counter for a given labels slice
	With(prometheus.Labels) prometheus.Counter
}

// GaugeVec is a Collector that bundles a set of Gauges that all share the
// same description.
type GaugeVec interface {
	// With returns a Gauge for a given labels slice
	With(prometheus.Labels) prometheus.Gauge
}

// MetricsCollector represents a way to change the metrics for the logsink
// api handler.
//
//go:generate go run go.uber.org/mock/mockgen -package mocks -destination mocks/metrics_collector_mock.go github.com/juju/juju/apiserver/logsink MetricsCollector
//go:generate go run go.uber.org/mock/mockgen -package mocks -destination mocks/metrics_mock.go github.com/prometheus/client_golang/prometheus Counter,Gauge
type MetricsCollector interface {

	// TotalConnections returns a prometheus metric that can be incremented
	// as a counter for the total number connections being served from the api
	// handler.
	TotalConnections() prometheus.Counter

	// Connections returns a prometheus metric that can be incremented and
	// decremented as a gauge, for the number connections being current served
	// from the api handler.
	Connections() prometheus.Gauge

	// PingFailureCount returns a prometheus metric for the number of
	// ping failures per model uuid, that can be incremented as
	// a counter.
	PingFailureCount(modelUUID string) prometheus.Counter

	// LogWriteCount returns a prometheus metric for the number of writes to
	// the log that happened. It's split on the success/failure, so the charts
	// will have to take that into account.
	LogWriteCount(modelUUID, state string) prometheus.Counter

	// LogReadCount returns a prometheus metric for the number of reads to
	// the log that happened. It's split on the success/error/disconnect, so
	// the charts will have to take that into account.
	LogReadCount(modelUUID, state string) prometheus.Counter
}

// NewHTTPHandler returns a new http.Handler for receiving log messages over a
// websocket, using the given NewLogWriteCloserFunc to obtain a writer to which
// the log messages will be written.
//
// ratelimit defines an optional rate-limit configuration. If nil, no rate-
// limiting will be applied.
func NewHTTPHandler(
	newLogWriteCloser NewLogWriteCloserFunc,
	abort <-chan struct{},
	ratelimit *RateLimitConfig,
	metrics MetricsCollector,
	modelUUID string,
) http.Handler {
	return &logSinkHandler{
		newLogWriteCloser: newLogWriteCloser,
		abort:             abort,
		ratelimit:         ratelimit,
		newStopChannel: func() (chan struct{}, func()) {
			ch := make(chan struct{})
			return ch, func() { close(ch) }
		},
		metrics:   metrics,
		modelUUID: modelUUID,
	}
}

// logSinkHandler serves a single logsink websocket endpoint, receiving log
// records from agents and persisting them via a LogWriteCloser.
type logSinkHandler struct {
	newLogWriteCloser NewLogWriteCloserFunc
	// abort is closed when the API server is shutting down; all
	// connections and receiver goroutines exit when it closes.
	abort <-chan struct{}
	// ratelimit, if non-nil, throttles how fast log records are accepted
	// per connection.
	ratelimit *RateLimitConfig
	metrics   MetricsCollector
	// modelUUID is the fallback (controller) model UUID used when the
	// request does not carry one.
	modelUUID string
	// mu serialises writes to the websocket and guards receiverStopped.
	mu sync.Mutex

	// newStopChannel is overridden in tests so that we can check the
	// goroutine exits when prompted.
	newStopChannel func() (chan struct{}, func())
	// receiverStopped records that the receiveLogs goroutine has finished
	// (its channel closed); written under mu.
	receiverStopped bool
}

// Since the logsink only receives messages, it is possible for the other end
// to disappear without the server noticing. To fix this, we use the
// underlying websocket control messages ping/pong. Periodically the server
// writes a ping, and the other end replies with a pong. Now the tricky bit is
// that it appears in all the examples found on the interweb that it is
// possible for the control message to be sent successfully to something that
// isn't entirely alive, which is why relying on an error return from the
// write call is insufficient to mark the connection as dead. Instead the
// write and read deadlines inherent in the underlying Go networking libraries
// are used to force errors on timeouts. However the underlying network
// libraries use time.Now() to determine whether or not to send errors, so
// using a testing clock here isn't going to work. So we rely on manual
// testing, and what is defined as good practice by the library authors.
//
// Now, in theory, we should be using this ping/pong across all the websockets,
// but that is a little outside the scope of this piece of work.

const (
	// For endpoints that don't support ping/pong (i.e. agents prior to 2.2-beta1)
	// we will time out their connections after six hours of inactivity.
	vZeroDelay = 6 * time.Hour
)

// ServeHTTP implements the http.Handler interface.
func (h *logSinkHandler) ServeHTTP(w http.ResponseWriter, req *http.Request) {
	// If the modelUUID from the request is empty, fallback to the one in
	// the logsink handler (controller UUID)
	resolvedModelUUID := h.modelUUID
	if modelUUID := httpcontext.RequestModelUUID(req); modelUUID != "" {
		resolvedModelUUID = modelUUID
	}

	handler := func(socket *websocket.Conn) {
		h.metrics.TotalConnections().Inc()
		h.metrics.Connections().Inc()
		defer h.metrics.Connections().Dec()

		// Closing the socket also unblocks the receiveLogs goroutine's
		// ReadJSON call, so it cannot outlive this handler.
		defer socket.Close()
		endpointVersion, err := h.getVersion(req)
		if err != nil {
			h.sendError(socket, req, err)
			return
		}
		writer, err := h.newLogWriteCloser(req)
		if err != nil {
			h.sendError(socket, req, err)
			return
		}
		defer writer.Close()

		// If we get to here, no more errors to report, so we report a nil
		// error. This way the first line of the socket is always a json
		// formatted simple error.
		h.sendError(socket, req, nil)

		// Here we configure the ping/pong handling for the websocket so the
		// server can notice when the client goes away. Older versions did not
		// respond to ping control messages, so don't try.
		var tickChannel <-chan time.Time
		if endpointVersion > 0 {
			// Each pong pushes the read deadline out; if the client stops
			// answering pings, reads fail and the connection is torn down.
			_ = socket.SetReadDeadline(time.Now().Add(websocket.PongDelay))
			socket.SetPongHandler(func(string) error {
				logger.Tracef("pong logsink %p", socket)
				_ = socket.SetReadDeadline(time.Now().Add(websocket.PongDelay))
				return nil
			})
			ticker := time.NewTicker(websocket.PingPeriod)
			defer ticker.Stop()
			tickChannel = ticker.C
		} else {
			// Old agents never pong; fall back to a long inactivity timeout.
			_ = socket.SetReadDeadline(time.Now().Add(vZeroDelay))
		}

		stopReceiving, closer := h.newStopChannel()
		defer closer()
		logCh := h.receiveLogs(socket, endpointVersion, resolvedModelUUID, stopReceiving)
		for {
			select {
			case <-h.abort:
				return
			case <-tickChannel:
				deadline := time.Now().Add(websocket.WriteWait)
				logger.Tracef("ping logsink %p", socket)
				if err := socket.WriteControl(gorillaws.PingMessage, []byte{}, deadline); err != nil {
					// This error is expected if the other end goes away. By
					// returning we clean up the strategy and close the socket
					// through the defer calls.
					logger.Debugf("failed to write ping: %s", err)
					// Bump the ping failure count.
					h.metrics.PingFailureCount(resolvedModelUUID).Inc()
					return
				}
			case m, ok := <-logCh:
				if !ok {
					// The receiver goroutine has finished; record that fact
					// under the mutex. The deferred Unlock is safe because we
					// return immediately below.
					h.mu.Lock()
					defer h.mu.Unlock()
					h.receiverStopped = true
					return
				}

				if err := writer.WriteLog(m); err != nil {
					h.sendError(socket, req, err)
					// Increment the number of failure cases per modelUUID, that
					// we where unable to write a log to - note: we won't see
					// why the failure happens, only that it did happen. Maybe
					// we should add a trace log here. Developer mode for send
					// error might help if it was enabled at first ?
					h.metrics.LogWriteCount(resolvedModelUUID, metricLogWriteLabelFailure).Inc()
					return
				}

				// Increment the number of successful modelUUID log writes, so
				// that we can see what's a success over failure case
				h.metrics.LogWriteCount(resolvedModelUUID, metricLogWriteLabelSuccess).Inc()
			}
		}
	}
	websocket.Serve(w, req, handler)
}

// getVersion extracts the logsink protocol version from the request's
// "version" query parameter. An absent parameter means version 0 (agents
// that predate ping/pong support); only "" and "1" are accepted.
func (h *logSinkHandler) getVersion(req *http.Request) (int, error) {
	verStr := req.URL.Query().Get("version")
	switch verStr {
	case "":
		return 0, nil
	case "1":
		return 1, nil
	default:
		return 0, errors.Errorf("unknown version %q", verStr)
	}
}

// receiveLogs starts a goroutine that reads JSON-encoded log records from
// the websocket and delivers them on the returned channel. The channel is
// closed when the read loop exits (read error, client disconnect, or the
// socket being closed by the caller). The goroutine also stops when either
// h.abort or stop is closed.
func (h *logSinkHandler) receiveLogs(socket *websocket.Conn,
	endpointVersion int,
	resolvedModelUUID string,
	stop <-chan struct{},
) <-chan params.LogRecord {
	logCh := make(chan params.LogRecord)

	var tokenBucket *ratelimit.Bucket
	if h.ratelimit != nil {
		tokenBucket = ratelimit.NewBucketWithClock(
			h.ratelimit.Refill,
			h.ratelimit.Burst,
			ratelimitClock{h.ratelimit.Clock},
		)
	}

	go func() {
		// Close the channel to signal ServeHTTP to finish. Otherwise
		// we leak goroutines on client disconnect, because the server
		// isn't shutting down so h.abort is never closed.
		defer close(logCh)
		for {
			// Receive() blocks until data arrives but will also be
			// unblocked when the API handler calls socket.Close as it
			// finishes.

			// Do not lift the LogRecord outside of the for-loop as any fields
			// with omitempty will not get cleared between iterations. The
			// logsink has to work with different versions of juju we need to
			// ensure that we have consistent data, even at the cost of
			// performance.
			var m params.LogRecord
			if err := socket.ReadJSON(&m); err != nil {
				if gorillaws.IsCloseError(err, gorillaws.CloseNormalClosure, gorillaws.CloseGoingAway) {
					logger.Tracef("logsink closed: %v", err)
					h.metrics.LogReadCount(resolvedModelUUID, metricLogReadLabelDisconnect).Inc()
				} else if gorillaws.IsUnexpectedCloseError(err, gorillaws.CloseNormalClosure, gorillaws.CloseGoingAway) {
					logger.Debugf("logsink unexpected close error: %v", err)
					h.metrics.LogReadCount(resolvedModelUUID, metricLogReadLabelError).Inc()
				} else {
					logger.Debugf("logsink error: %v", err)
					h.metrics.LogReadCount(resolvedModelUUID, metricLogReadLabelError).Inc()
				}
				// Try to tell the other end we are closing. If the other end
				// has already disconnected from us, this will fail, but we don't
				// care that much.
				// The mutex serialises this write with sendError's writes on
				// the same socket.
				h.mu.Lock()
				defer h.mu.Unlock()
				_ = socket.WriteMessage(gorillaws.CloseMessage, []byte{})
				return
			}
			h.metrics.LogReadCount(resolvedModelUUID, metricLogReadLabelSuccess).Inc()

			// Rate-limit receipt of log messages. We rate-limit
			// each connection individually to prevent one noisy
			// individual from drowning out the others.
			if tokenBucket != nil {
				if d := tokenBucket.Take(1); d > 0 {
					select {
					case <-h.ratelimit.Clock.After(d):
					case <-h.abort:
						return
					}
				}
			}

			// Send the log message.
			select {
			case <-h.abort:
				// The API server is stopping.
				return
			case <-stop:
				// The ServeHTTP handler has stopped.
				return
			case logCh <- m:
				// If the remote end does not support ping/pong, we bump
				// the read deadline everytime a message is received.
				if endpointVersion == 0 {
					_ = socket.SetReadDeadline(time.Now().Add(vZeroDelay))
				}
			}
		}
	}()

	return logCh
}

// sendError sends a JSON-encoded error response as the first message on the
// websocket. A nil err is sent to signal successful connection setup.
374 func (h *logSinkHandler) sendError(ws *websocket.Conn, req *http.Request, err error) { 375 // There is no need to log the error for normal operators as there is nothing 376 // they can action. This is for developers. 377 if err != nil && featureflag.Enabled(feature.DeveloperMode) { 378 logger.Errorf("returning error from %s %s: %s", req.Method, req.URL.Path, errors.Details(err)) 379 } 380 h.mu.Lock() 381 defer h.mu.Unlock() 382 if sendErr := ws.SendInitialErrorV0(err); sendErr != nil { 383 logger.Errorf("closing websocket, %v", err) 384 ws.Close() 385 } 386 } 387 388 // ratelimitClock adapts clock.Clock to ratelimit.Clock. 389 type ratelimitClock struct { 390 clock.Clock 391 } 392 393 // Sleep is defined by the ratelimit.Clock interface. 394 func (c ratelimitClock) Sleep(d time.Duration) { 395 <-c.Clock.After(d) 396 }