github.com/juju/juju@v0.0.0-20240430160146-1752b71fcf00/database/node.go (about) 1 // Copyright 2022 Canonical Ltd. 2 // Licensed under the AGPLv3, see LICENCE file for details. 3 4 package database 5 6 import ( 7 "context" 8 "crypto/tls" 9 "crypto/x509" 10 "fmt" 11 "io" 12 "net" 13 "os" 14 "path" 15 "path/filepath" 16 "strconv" 17 "strings" 18 "time" 19 20 "github.com/juju/collections/transform" 21 "github.com/juju/errors" 22 "github.com/juju/loggo" 23 "gopkg.in/yaml.v3" 24 25 "github.com/juju/juju/agent" 26 coredatabase "github.com/juju/juju/core/database" 27 corenetwork "github.com/juju/juju/core/network" 28 "github.com/juju/juju/database/app" 29 "github.com/juju/juju/database/client" 30 "github.com/juju/juju/database/dqlite" 31 "github.com/juju/juju/network" 32 ) 33 34 const ( 35 dqliteBootstrapBindIP = "127.0.0.1" 36 dqliteDataDir = "dqlite" 37 dqlitePort = 17666 38 dqliteClusterFileName = "cluster.yaml" 39 ) 40 41 // NodeManager is responsible for interrogating a single Dqlite node, 42 // and emitting configuration for starting its Dqlite `App` based on 43 // operational requirements and controller agent config. 44 type NodeManager struct { 45 cfg agent.Config 46 port int 47 isLoopbackPreferred bool 48 logger Logger 49 slowQueryLogger coredatabase.SlowQueryLogger 50 51 dataDir string 52 } 53 54 // NewNodeManager returns a new NodeManager reference 55 // based on the input agent configuration. 56 // 57 // If isLoopbackPreferred is true, we bind Dqlite to 127.0.0.1 and eschew TLS 58 // termination. This is useful primarily in unit testing and a temporary 59 // workaround for CAAS, which does not yet support enable-ha. 60 // 61 // If it is false, we attempt to identify a unique local-cloud address. 62 // If we find one, we use it as the bind address. Otherwise, we fall back 63 // to the loopback binding. 64 func NewNodeManager(cfg agent.Config, isLoopbackPreferred bool, logger Logger, slowQueryLogger coredatabase.SlowQueryLogger) *NodeManager { 65 m := &NodeManager{ 66 cfg: cfg, 67 port: dqlitePort, 68 isLoopbackPreferred: isLoopbackPreferred, 69 logger: logger, 70 slowQueryLogger: slowQueryLogger, 71 } 72 if cfg != nil { 73 if port, ok := cfg.DqlitePort(); ok { 74 m.port = port 75 } 76 } 77 return m 78 } 79 80 // IsLoopbackPreferred returns true if we should prefer to bind Dqlite 81 // to the loopback IP address. 82 // This is currently true for CAAS and unit testing. Once CAAS supports 83 // enable-ha we'll have to revisit this. 84 func (m *NodeManager) IsLoopbackPreferred() bool { 85 return m.isLoopbackPreferred 86 } 87 88 // IsLoopbackBound returns true if we are a cluster of one, 89 // and bound to the loopback IP address. 90 func (m *NodeManager) IsLoopbackBound(ctx context.Context) (bool, error) { 91 extant, err := m.IsExistingNode() 92 if err != nil { 93 return false, errors.Annotate(err, "determining existing Dqlite node") 94 } 95 if !extant { 96 return false, nil 97 } 98 99 servers, err := m.ClusterServers(ctx) 100 if err != nil { 101 return false, errors.Trace(err) 102 } 103 104 if len(servers) != 1 { 105 return false, nil 106 } 107 108 return strings.HasPrefix(servers[0].Address, dqliteBootstrapBindIP), nil 109 } 110 111 // IsExistingNode returns true if this machine or container has 112 // ever started a Dqlite `App` before. Specifically, this is whether 113 // the Dqlite data directory is empty. 114 func (m *NodeManager) IsExistingNode() (bool, error) { 115 if _, err := m.EnsureDataDir(); err != nil { 116 return false, errors.Annotate(err, "ensuring Dqlite data directory") 117 } 118 119 dir, err := os.Open(m.dataDir) 120 if err != nil { 121 return false, errors.Annotate(err, "opening Dqlite data directory") 122 } 123 124 _, err = dir.Readdirnames(1) 125 switch err { 126 case nil: 127 return true, nil 128 case io.EOF: 129 return false, nil 130 default: 131 return false, errors.Annotate(err, "reading Dqlite data directory") 132 } 133 } 134 135 // EnsureDataDir ensures that a directory for Dqlite data exists at 136 // a path determined by the agent config, then returns that path. 137 func (m *NodeManager) EnsureDataDir() (string, error) { 138 if m.dataDir == "" { 139 dir := filepath.Join(m.cfg.DataDir(), dqliteDataDir) 140 if err := os.MkdirAll(dir, 0700); err != nil { 141 return "", errors.Annotatef(err, "creating directory for Dqlite data") 142 } 143 m.dataDir = dir 144 } 145 return m.dataDir, nil 146 } 147 148 // SetClusterToLocalNode reconfigures the Dqlite cluster so that it has the 149 // local node as its only member. 150 // This is intended as a disaster recovery utility, and should only be called: 151 // 1. At great need. 152 // 2. With steadfast guarantees of data integrity. 153 func (m *NodeManager) SetClusterToLocalNode(ctx context.Context) error { 154 node, err := m.NodeInfo() 155 if err != nil { 156 return errors.Trace(err) 157 } 158 return errors.Trace(m.SetClusterServers(ctx, []dqlite.NodeInfo{node})) 159 } 160 161 // ClusterServers returns the node information for 162 // Dqlite nodes configured to be in the cluster. 163 func (m *NodeManager) ClusterServers(ctx context.Context) ([]dqlite.NodeInfo, error) { 164 store, err := m.nodeClusterStore() 165 if err != nil { 166 return nil, errors.Trace(err) 167 } 168 servers, err := store.Get(ctx) 169 return servers, errors.Annotate(err, "retrieving servers from Dqlite node store") 170 } 171 172 // SetClusterServers reconfigures the Dqlite cluster by writing the 173 // input servers to Dqlite's Raft log and the local node YAML store. 174 // This should only be called on a stopped Dqlite node. 175 func (m *NodeManager) SetClusterServers(ctx context.Context, servers []dqlite.NodeInfo) error { 176 store, err := m.nodeClusterStore() 177 if err != nil { 178 return errors.Trace(err) 179 } 180 181 if err := dqlite.ReconfigureMembership(m.dataDir, servers); err != nil { 182 return errors.Annotate(err, "reconfiguring Dqlite cluster membership") 183 } 184 185 return errors.Annotate(store.Set(ctx, servers), "writing servers to Dqlite node store") 186 } 187 188 // NodeInfo reads the local node information file in the Dqlite directory 189 // and returns the dqlite.NodeInfo represented by its contents. 190 func (m *NodeManager) NodeInfo() (dqlite.NodeInfo, error) { 191 var node dqlite.NodeInfo 192 193 data, err := os.ReadFile(path.Join(m.dataDir, "info.yaml")) 194 if err != nil { 195 return node, errors.Annotate(err, "reading info.yaml") 196 } 197 198 err = yaml.Unmarshal(data, &node) 199 return node, errors.Annotate(err, "decoding NodeInfo") 200 } 201 202 // SetNodeInfo rewrites the local node information file in the Dqlite 203 // data directory, so that it matches the input NodeInfo. 204 // This should only be called on a stopped Dqlite node. 205 func (m *NodeManager) SetNodeInfo(server dqlite.NodeInfo) error { 206 data, err := yaml.Marshal(server) 207 if err != nil { 208 return errors.Annotatef(err, "marshalling NodeInfo %#v", server) 209 } 210 return errors.Annotatef( 211 os.WriteFile(path.Join(m.dataDir, "info.yaml"), data, 0600), "writing info.yaml to %s", m.dataDir) 212 } 213 214 // WithLogFuncOption returns a Dqlite application Option that will proxy Dqlite 215 // log output via this factory's logger where the level is recognised. 216 func (m *NodeManager) WithLogFuncOption() app.Option { 217 if m.cfg.QueryTracingEnabled() { 218 return app.WithLogFunc(m.slowQueryLogFunc(m.cfg.QueryTracingThreshold())) 219 } 220 return app.WithLogFunc(m.appLogFunc) 221 } 222 223 // WithTracingOption returns a Dqlite application Option that will enable 224 // tracing of Dqlite queries. 225 func (m *NodeManager) WithTracingOption() app.Option { 226 if m.cfg.QueryTracingEnabled() { 227 return app.WithTracing(client.LogWarn) 228 } 229 return app.WithTracing(client.LogNone) 230 } 231 232 // WithPreferredCloudLocalAddressOption uses the input network config source to 233 // return a local-cloud address to which to bind Dqlite, provided that a unique 234 // one can be determined. 235 // If there are zero or multiple local-cloud addresses detected on the host, 236 // we fall back to binding to the loopback address. 237 // This method is only relevant to bootstrap. At all other times (such as when 238 // joining a cluster) the bind address is determined externally and passed as 239 // the argument to WithAddressOption. 240 func (m *NodeManager) WithPreferredCloudLocalAddressOption(source corenetwork.ConfigSource) (app.Option, error) { 241 nics, err := source.Interfaces() 242 if err != nil { 243 return nil, errors.Annotate(err, "querying local network interfaces") 244 } 245 246 var addrs corenetwork.MachineAddresses 247 for _, nic := range nics { 248 name := nic.Name() 249 if nic.Type() == corenetwork.LoopbackDevice || 250 name == network.DefaultLXDBridge || 251 name == network.DefaultKVMBridge || 252 name == network.DefaultDockerBridge { 253 continue 254 } 255 256 sysAddrs, err := nic.Addresses() 257 if err != nil || len(sysAddrs) == 0 { 258 continue 259 } 260 261 for _, addr := range sysAddrs { 262 addrs = append(addrs, corenetwork.NewMachineAddress(addr.IP().String())) 263 } 264 } 265 266 cloudLocal := addrs.AllMatchingScope(corenetwork.ScopeMatchCloudLocal).Values() 267 if len(cloudLocal) == 1 { 268 return m.WithAddressOption(cloudLocal[0]), nil 269 } 270 271 m.logger.Warningf("failed to determine a unique local-cloud address; falling back to 127.0.0.1 for Dqlite") 272 return m.WithLoopbackAddressOption(), nil 273 } 274 275 // WithLoopbackAddressOption returns a Dqlite application 276 // Option that will bind Dqlite to the loopback IP. 277 func (m *NodeManager) WithLoopbackAddressOption() app.Option { 278 return m.WithAddressOption(dqliteBootstrapBindIP) 279 } 280 281 // WithAddressOption returns a Dqlite application Option 282 // for specifying the local address:port to use. 283 func (m *NodeManager) WithAddressOption(ip string) app.Option { 284 // dqlite expects an ipv6 address to be in square brackets 285 // e.g. [::1]:1234 so we need to use net.JoinHostPort. 286 return app.WithAddress(net.JoinHostPort(ip, strconv.Itoa(m.port))) 287 } 288 289 // WithTLSOption returns a Dqlite application Option for TLS encryption 290 // of traffic between clients and clustered application nodes. 291 func (m *NodeManager) WithTLSOption() (app.Option, error) { 292 stateInfo, ok := m.cfg.StateServingInfo() 293 if !ok { 294 return nil, errors.NotSupportedf("Dqlite node initialisation on non-controller machine/container") 295 } 296 297 caCertPool := x509.NewCertPool() 298 caCertPool.AppendCertsFromPEM([]byte(m.cfg.CACert())) 299 300 controllerCert, err := tls.X509KeyPair([]byte(stateInfo.Cert), []byte(stateInfo.PrivateKey)) 301 if err != nil { 302 return nil, errors.Annotate(err, "parsing controller certificate") 303 } 304 305 listen := &tls.Config{ 306 ClientCAs: caCertPool, 307 Certificates: []tls.Certificate{controllerCert}, 308 } 309 310 dial := &tls.Config{ 311 RootCAs: caCertPool, 312 Certificates: []tls.Certificate{controllerCert}, 313 // We cannot provide a ServerName value here, so we rely on the 314 // server validating the controller's client certificate. 315 InsecureSkipVerify: true, 316 } 317 318 return app.WithTLS(listen, dial), nil 319 } 320 321 // WithClusterOption returns a Dqlite application Option for initialising 322 // Dqlite as the member of a cluster with peers representing other controllers. 323 func (m *NodeManager) WithClusterOption(addrs []string) app.Option { 324 peerAddrs := transform.Slice(addrs, func(addr string) string { 325 return fmt.Sprintf("%s:%d", addr, m.port) 326 }) 327 328 m.logger.Debugf("determined Dqlite cluster members: %v", peerAddrs) 329 return app.WithCluster(peerAddrs) 330 } 331 332 // nodeClusterStore returns a YamlNodeStore instance based 333 // on the cluster.yaml file in the Dqlite data directory. 334 func (m *NodeManager) nodeClusterStore() (*client.YamlNodeStore, error) { 335 store, err := client.NewYamlNodeStore(path.Join(m.dataDir, dqliteClusterFileName)) 336 return store, errors.Annotate(err, "opening Dqlite cluster node store") 337 } 338 339 func (m *NodeManager) slowQueryLogFunc(threshold time.Duration) client.LogFunc { 340 return func(level client.LogLevel, msg string, args ...interface{}) { 341 if level != client.LogWarn { 342 m.appLogFunc(level, msg, args...) 343 return 344 } 345 346 // If we're tracing the dqlite logs we only want to log slow queries 347 // and not all the debug messages. 348 queryType, duration, stmt := parseSlowQuery(msg, args, threshold) 349 switch queryType { 350 case slowQuery: 351 m.slowQueryLogger.RecordSlowQuery(msg, stmt, args, duration) 352 case normalQuery: 353 m.appLogFunc(level, msg, args...) 354 default: 355 // This is a slow query, but we shouldn't report it. 356 } 357 } 358 } 359 360 func (m *NodeManager) appLogFunc(level client.LogLevel, msg string, args ...interface{}) { 361 actualLevel, known := loggo.ParseLevel(level.String()) 362 if !known { 363 return 364 } 365 366 m.logger.Logf(actualLevel, msg, args...) 367 } 368 369 // QueryType represents the type of query that is being sent. This simplifies 370 // the logic for determining if a query is slow or not and if it should be 371 // reported. 372 type queryType int 373 374 const ( 375 normalQuery queryType = iota 376 slowQuery 377 ignoreSlowQuery 378 ) 379 380 // This is highly dependent on the format of the log message, which is 381 // not ideal, but it's the only way to get the query string out of the 382 // log message. This potentially breaks if the dqlite library changes the 383 // format of the log message. It would be better if the dqlite library 384 // provided a way to get traces from a request that wasn't tied to the logging 385 // system. 386 // 387 // The timed queries logged to the tracing request are for the whole time the 388 // query is being processed. This includes the network time, along with the 389 // time performing the sqlite query. If the node is sensitive to latency, then 390 // it will show up here, even though the query itself might be fast at the 391 // sqlite level. 392 // 393 // Raw log messages will be in the form: 394 // 395 // - "%.3fs request query: %q" 396 // - "%.3fs request exec: %q" 397 // - "%.3fs request prepared: %q" 398 // 399 // It is expected that each log message will have 2 arguments, the first being 400 // the duration of the query in seconds as a float64. The second being the query 401 // performed as a string. 402 func parseSlowQuery(msg string, args []any, slowQueryThreshold time.Duration) (queryType, float64, string) { 403 if len(args) != 2 { 404 return normalQuery, 0, "" 405 } 406 407 // We're not a slow query if the message doesn't match the expected format. 408 if !strings.HasPrefix(msg, "%.3fs request ") { 409 return normalQuery, 0, "" 410 } 411 412 // Validate that the first argument is a float64. 413 var duration float64 414 switch t := args[0].(type) { 415 case float64: 416 duration = t 417 default: 418 return normalQuery, 0, "" 419 } 420 421 var stmt string 422 switch t := args[1].(type) { 423 case string: 424 stmt = t 425 default: 426 return normalQuery, 0, "" 427 } 428 429 if duration >= slowQueryThreshold.Seconds() { 430 return slowQuery, duration, stmt 431 } 432 433 return ignoreSlowQuery, duration, stmt 434 }