github.com/juju/juju@v0.0.0-20240430160146-1752b71fcf00/database/node.go (about)

     1  // Copyright 2022 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package database
     5  
     6  import (
     7  	"context"
     8  	"crypto/tls"
     9  	"crypto/x509"
    10  	"fmt"
    11  	"io"
    12  	"net"
    13  	"os"
    14  	"path"
    15  	"path/filepath"
    16  	"strconv"
    17  	"strings"
    18  	"time"
    19  
    20  	"github.com/juju/collections/transform"
    21  	"github.com/juju/errors"
    22  	"github.com/juju/loggo"
    23  	"gopkg.in/yaml.v3"
    24  
    25  	"github.com/juju/juju/agent"
    26  	coredatabase "github.com/juju/juju/core/database"
    27  	corenetwork "github.com/juju/juju/core/network"
    28  	"github.com/juju/juju/database/app"
    29  	"github.com/juju/juju/database/client"
    30  	"github.com/juju/juju/database/dqlite"
    31  	"github.com/juju/juju/network"
    32  )
    33  
const (
	// dqliteBootstrapBindIP is the loopback IP that Dqlite is bound to when
	// loopback binding is requested or no unique local-cloud address exists.
	dqliteBootstrapBindIP = "127.0.0.1"
	// dqliteDataDir is the name of the directory, beneath the agent's data
	// directory, that holds Dqlite data.
	dqliteDataDir         = "dqlite"
	// dqlitePort is the port used for Dqlite unless the agent
	// configuration supplies a different one.
	dqlitePort            = 17666
	// dqliteClusterFileName is the name of the YAML file in the Dqlite data
	// directory that backs the cluster node store.
	dqliteClusterFileName = "cluster.yaml"
)
    40  
// NodeManager is responsible for interrogating a single Dqlite node,
// and emitting configuration for starting its Dqlite `App` based on
// operational requirements and controller agent config.
type NodeManager struct {
	// cfg is the agent configuration; it supplies the data directory,
	// an optional Dqlite port override, TLS material and the query
	// tracing settings.
	cfg                 agent.Config
	// port is the port used when composing Dqlite addresses.
	port                int
	// isLoopbackPreferred records the caller's request to bind Dqlite
	// to the loopback address.
	isLoopbackPreferred bool
	// logger receives proxied Dqlite log output and local diagnostics.
	logger              Logger
	// slowQueryLogger receives queries whose traced duration meets the
	// configured threshold.
	slowQueryLogger     coredatabase.SlowQueryLogger

	// dataDir caches the Dqlite data directory path. It is empty until
	// EnsureDataDir has been called successfully.
	dataDir string
}
    53  
    54  // NewNodeManager returns a new NodeManager reference
    55  // based on the input agent configuration.
    56  //
    57  // If isLoopbackPreferred is true, we bind Dqlite to 127.0.0.1 and eschew TLS
    58  // termination. This is useful primarily in unit testing and a temporary
    59  // workaround for CAAS, which does not yet support enable-ha.
    60  //
    61  // If it is false, we attempt to identify a unique local-cloud address.
    62  // If we find one, we use it as the bind address. Otherwise, we fall back
    63  // to the loopback binding.
    64  func NewNodeManager(cfg agent.Config, isLoopbackPreferred bool, logger Logger, slowQueryLogger coredatabase.SlowQueryLogger) *NodeManager {
    65  	m := &NodeManager{
    66  		cfg:                 cfg,
    67  		port:                dqlitePort,
    68  		isLoopbackPreferred: isLoopbackPreferred,
    69  		logger:              logger,
    70  		slowQueryLogger:     slowQueryLogger,
    71  	}
    72  	if cfg != nil {
    73  		if port, ok := cfg.DqlitePort(); ok {
    74  			m.port = port
    75  		}
    76  	}
    77  	return m
    78  }
    79  
// IsLoopbackPreferred returns true if we should prefer to bind Dqlite
// to the loopback IP address. The value is the one that was passed to
// NewNodeManager.
// This is currently true for CAAS and unit testing. Once CAAS supports
// enable-ha we'll have to revisit this.
func (m *NodeManager) IsLoopbackPreferred() bool {
	return m.isLoopbackPreferred
}
    87  
    88  // IsLoopbackBound returns true if we are a cluster of one,
    89  // and bound to the loopback IP address.
    90  func (m *NodeManager) IsLoopbackBound(ctx context.Context) (bool, error) {
    91  	extant, err := m.IsExistingNode()
    92  	if err != nil {
    93  		return false, errors.Annotate(err, "determining existing Dqlite node")
    94  	}
    95  	if !extant {
    96  		return false, nil
    97  	}
    98  
    99  	servers, err := m.ClusterServers(ctx)
   100  	if err != nil {
   101  		return false, errors.Trace(err)
   102  	}
   103  
   104  	if len(servers) != 1 {
   105  		return false, nil
   106  	}
   107  
   108  	return strings.HasPrefix(servers[0].Address, dqliteBootstrapBindIP), nil
   109  }
   110  
   111  // IsExistingNode returns true if this machine or container has
   112  // ever started a Dqlite `App` before. Specifically, this is whether
   113  // the Dqlite data directory is empty.
   114  func (m *NodeManager) IsExistingNode() (bool, error) {
   115  	if _, err := m.EnsureDataDir(); err != nil {
   116  		return false, errors.Annotate(err, "ensuring Dqlite data directory")
   117  	}
   118  
   119  	dir, err := os.Open(m.dataDir)
   120  	if err != nil {
   121  		return false, errors.Annotate(err, "opening Dqlite data directory")
   122  	}
   123  
   124  	_, err = dir.Readdirnames(1)
   125  	switch err {
   126  	case nil:
   127  		return true, nil
   128  	case io.EOF:
   129  		return false, nil
   130  	default:
   131  		return false, errors.Annotate(err, "reading Dqlite data directory")
   132  	}
   133  }
   134  
   135  // EnsureDataDir ensures that a directory for Dqlite data exists at
   136  // a path determined by the agent config, then returns that path.
   137  func (m *NodeManager) EnsureDataDir() (string, error) {
   138  	if m.dataDir == "" {
   139  		dir := filepath.Join(m.cfg.DataDir(), dqliteDataDir)
   140  		if err := os.MkdirAll(dir, 0700); err != nil {
   141  			return "", errors.Annotatef(err, "creating directory for Dqlite data")
   142  		}
   143  		m.dataDir = dir
   144  	}
   145  	return m.dataDir, nil
   146  }
   147  
   148  // SetClusterToLocalNode reconfigures the Dqlite cluster so that it has the
   149  // local node as its only member.
   150  // This is intended as a disaster recovery utility, and should only be called:
   151  // 1. At great need.
   152  // 2. With steadfast guarantees of data integrity.
   153  func (m *NodeManager) SetClusterToLocalNode(ctx context.Context) error {
   154  	node, err := m.NodeInfo()
   155  	if err != nil {
   156  		return errors.Trace(err)
   157  	}
   158  	return errors.Trace(m.SetClusterServers(ctx, []dqlite.NodeInfo{node}))
   159  }
   160  
   161  // ClusterServers returns the node information for
   162  // Dqlite nodes configured to be in the cluster.
   163  func (m *NodeManager) ClusterServers(ctx context.Context) ([]dqlite.NodeInfo, error) {
   164  	store, err := m.nodeClusterStore()
   165  	if err != nil {
   166  		return nil, errors.Trace(err)
   167  	}
   168  	servers, err := store.Get(ctx)
   169  	return servers, errors.Annotate(err, "retrieving servers from Dqlite node store")
   170  }
   171  
   172  // SetClusterServers reconfigures the Dqlite cluster by writing the
   173  // input servers to Dqlite's Raft log and the local node YAML store.
   174  // This should only be called on a stopped Dqlite node.
   175  func (m *NodeManager) SetClusterServers(ctx context.Context, servers []dqlite.NodeInfo) error {
   176  	store, err := m.nodeClusterStore()
   177  	if err != nil {
   178  		return errors.Trace(err)
   179  	}
   180  
   181  	if err := dqlite.ReconfigureMembership(m.dataDir, servers); err != nil {
   182  		return errors.Annotate(err, "reconfiguring Dqlite cluster membership")
   183  	}
   184  
   185  	return errors.Annotate(store.Set(ctx, servers), "writing servers to Dqlite node store")
   186  }
   187  
   188  // NodeInfo reads the local node information file in the Dqlite directory
   189  // and returns the dqlite.NodeInfo represented by its contents.
   190  func (m *NodeManager) NodeInfo() (dqlite.NodeInfo, error) {
   191  	var node dqlite.NodeInfo
   192  
   193  	data, err := os.ReadFile(path.Join(m.dataDir, "info.yaml"))
   194  	if err != nil {
   195  		return node, errors.Annotate(err, "reading info.yaml")
   196  	}
   197  
   198  	err = yaml.Unmarshal(data, &node)
   199  	return node, errors.Annotate(err, "decoding NodeInfo")
   200  }
   201  
   202  // SetNodeInfo rewrites the local node information file in the Dqlite
   203  // data directory, so that it matches the input NodeInfo.
   204  // This should only be called on a stopped Dqlite node.
   205  func (m *NodeManager) SetNodeInfo(server dqlite.NodeInfo) error {
   206  	data, err := yaml.Marshal(server)
   207  	if err != nil {
   208  		return errors.Annotatef(err, "marshalling NodeInfo %#v", server)
   209  	}
   210  	return errors.Annotatef(
   211  		os.WriteFile(path.Join(m.dataDir, "info.yaml"), data, 0600), "writing info.yaml to %s", m.dataDir)
   212  }
   213  
   214  // WithLogFuncOption returns a Dqlite application Option that will proxy Dqlite
   215  // log output via this factory's logger where the level is recognised.
   216  func (m *NodeManager) WithLogFuncOption() app.Option {
   217  	if m.cfg.QueryTracingEnabled() {
   218  		return app.WithLogFunc(m.slowQueryLogFunc(m.cfg.QueryTracingThreshold()))
   219  	}
   220  	return app.WithLogFunc(m.appLogFunc)
   221  }
   222  
   223  // WithTracingOption returns a Dqlite application Option that will enable
   224  // tracing of Dqlite queries.
   225  func (m *NodeManager) WithTracingOption() app.Option {
   226  	if m.cfg.QueryTracingEnabled() {
   227  		return app.WithTracing(client.LogWarn)
   228  	}
   229  	return app.WithTracing(client.LogNone)
   230  }
   231  
   232  // WithPreferredCloudLocalAddressOption uses the input network config source to
   233  // return a local-cloud address to which to bind Dqlite, provided that a unique
   234  // one can be determined.
   235  // If there are zero or multiple local-cloud addresses detected on the host,
   236  // we fall back to binding to the loopback address.
   237  // This method is only relevant to bootstrap. At all other times (such as when
   238  // joining a cluster) the bind address is determined externally and passed as
   239  // the argument to WithAddressOption.
   240  func (m *NodeManager) WithPreferredCloudLocalAddressOption(source corenetwork.ConfigSource) (app.Option, error) {
   241  	nics, err := source.Interfaces()
   242  	if err != nil {
   243  		return nil, errors.Annotate(err, "querying local network interfaces")
   244  	}
   245  
   246  	var addrs corenetwork.MachineAddresses
   247  	for _, nic := range nics {
   248  		name := nic.Name()
   249  		if nic.Type() == corenetwork.LoopbackDevice ||
   250  			name == network.DefaultLXDBridge ||
   251  			name == network.DefaultKVMBridge ||
   252  			name == network.DefaultDockerBridge {
   253  			continue
   254  		}
   255  
   256  		sysAddrs, err := nic.Addresses()
   257  		if err != nil || len(sysAddrs) == 0 {
   258  			continue
   259  		}
   260  
   261  		for _, addr := range sysAddrs {
   262  			addrs = append(addrs, corenetwork.NewMachineAddress(addr.IP().String()))
   263  		}
   264  	}
   265  
   266  	cloudLocal := addrs.AllMatchingScope(corenetwork.ScopeMatchCloudLocal).Values()
   267  	if len(cloudLocal) == 1 {
   268  		return m.WithAddressOption(cloudLocal[0]), nil
   269  	}
   270  
   271  	m.logger.Warningf("failed to determine a unique local-cloud address; falling back to 127.0.0.1 for Dqlite")
   272  	return m.WithLoopbackAddressOption(), nil
   273  }
   274  
// WithLoopbackAddressOption returns a Dqlite application
// Option that will bind Dqlite to the loopback IP (dqliteBootstrapBindIP)
// on this manager's port.
func (m *NodeManager) WithLoopbackAddressOption() app.Option {
	return m.WithAddressOption(dqliteBootstrapBindIP)
}
   280  
   281  // WithAddressOption returns a Dqlite application Option
   282  // for specifying the local address:port to use.
   283  func (m *NodeManager) WithAddressOption(ip string) app.Option {
   284  	// dqlite expects an ipv6 address to be in square brackets
   285  	// e.g. [::1]:1234 so we need to use net.JoinHostPort.
   286  	return app.WithAddress(net.JoinHostPort(ip, strconv.Itoa(m.port)))
   287  }
   288  
   289  // WithTLSOption returns a Dqlite application Option for TLS encryption
   290  // of traffic between clients and clustered application nodes.
   291  func (m *NodeManager) WithTLSOption() (app.Option, error) {
   292  	stateInfo, ok := m.cfg.StateServingInfo()
   293  	if !ok {
   294  		return nil, errors.NotSupportedf("Dqlite node initialisation on non-controller machine/container")
   295  	}
   296  
   297  	caCertPool := x509.NewCertPool()
   298  	caCertPool.AppendCertsFromPEM([]byte(m.cfg.CACert()))
   299  
   300  	controllerCert, err := tls.X509KeyPair([]byte(stateInfo.Cert), []byte(stateInfo.PrivateKey))
   301  	if err != nil {
   302  		return nil, errors.Annotate(err, "parsing controller certificate")
   303  	}
   304  
   305  	listen := &tls.Config{
   306  		ClientCAs:    caCertPool,
   307  		Certificates: []tls.Certificate{controllerCert},
   308  	}
   309  
   310  	dial := &tls.Config{
   311  		RootCAs:      caCertPool,
   312  		Certificates: []tls.Certificate{controllerCert},
   313  		// We cannot provide a ServerName value here, so we rely on the
   314  		// server validating the controller's client certificate.
   315  		InsecureSkipVerify: true,
   316  	}
   317  
   318  	return app.WithTLS(listen, dial), nil
   319  }
   320  
   321  // WithClusterOption returns a Dqlite application Option for initialising
   322  // Dqlite as the member of a cluster with peers representing other controllers.
   323  func (m *NodeManager) WithClusterOption(addrs []string) app.Option {
   324  	peerAddrs := transform.Slice(addrs, func(addr string) string {
   325  		return fmt.Sprintf("%s:%d", addr, m.port)
   326  	})
   327  
   328  	m.logger.Debugf("determined Dqlite cluster members: %v", peerAddrs)
   329  	return app.WithCluster(peerAddrs)
   330  }
   331  
   332  // nodeClusterStore returns a YamlNodeStore instance based
   333  // on the cluster.yaml file in the Dqlite data directory.
   334  func (m *NodeManager) nodeClusterStore() (*client.YamlNodeStore, error) {
   335  	store, err := client.NewYamlNodeStore(path.Join(m.dataDir, dqliteClusterFileName))
   336  	return store, errors.Annotate(err, "opening Dqlite cluster node store")
   337  }
   338  
   339  func (m *NodeManager) slowQueryLogFunc(threshold time.Duration) client.LogFunc {
   340  	return func(level client.LogLevel, msg string, args ...interface{}) {
   341  		if level != client.LogWarn {
   342  			m.appLogFunc(level, msg, args...)
   343  			return
   344  		}
   345  
   346  		// If we're tracing the dqlite logs we only want to log slow queries
   347  		// and not all the debug messages.
   348  		queryType, duration, stmt := parseSlowQuery(msg, args, threshold)
   349  		switch queryType {
   350  		case slowQuery:
   351  			m.slowQueryLogger.RecordSlowQuery(msg, stmt, args, duration)
   352  		case normalQuery:
   353  			m.appLogFunc(level, msg, args...)
   354  		default:
   355  			// This is a slow query, but we shouldn't report it.
   356  		}
   357  	}
   358  }
   359  
   360  func (m *NodeManager) appLogFunc(level client.LogLevel, msg string, args ...interface{}) {
   361  	actualLevel, known := loggo.ParseLevel(level.String())
   362  	if !known {
   363  		return
   364  	}
   365  
   366  	m.logger.Logf(actualLevel, msg, args...)
   367  }
   368  
   369  // QueryType represents the type of query that is being sent. This simplifies
   370  // the logic for determining if a query is slow or not and if it should be
   371  // reported.
   372  type queryType int
   373  
   374  const (
   375  	normalQuery queryType = iota
   376  	slowQuery
   377  	ignoreSlowQuery
   378  )
   379  
   380  // This is highly dependent on the format of the log message, which is
   381  // not ideal, but it's the only way to get the query string out of the
   382  // log message. This potentially breaks if the dqlite library changes the
   383  // format of the log message. It would be better if the dqlite library
   384  // provided a way to get traces from a request that wasn't tied to the logging
   385  // system.
   386  //
   387  // The timed queries logged to the tracing request are for the whole time the
   388  // query is being processed. This includes the network time, along with the
   389  // time performing the sqlite query. If the node is sensitive to latency, then
   390  // it will show up here, even though the query itself might be fast at the
   391  // sqlite level.
   392  //
   393  // Raw log messages will be in the form:
   394  //
   395  //   - "%.3fs request query: %q"
   396  //   - "%.3fs request exec: %q"
   397  //   - "%.3fs request prepared: %q"
   398  //
   399  // It is expected that each log message will have 2 arguments, the first being
   400  // the duration of the query in seconds as a float64. The second being the query
   401  // performed as a string.
   402  func parseSlowQuery(msg string, args []any, slowQueryThreshold time.Duration) (queryType, float64, string) {
   403  	if len(args) != 2 {
   404  		return normalQuery, 0, ""
   405  	}
   406  
   407  	// We're not a slow query if the message doesn't match the expected format.
   408  	if !strings.HasPrefix(msg, "%.3fs request ") {
   409  		return normalQuery, 0, ""
   410  	}
   411  
   412  	// Validate that the first argument is a float64.
   413  	var duration float64
   414  	switch t := args[0].(type) {
   415  	case float64:
   416  		duration = t
   417  	default:
   418  		return normalQuery, 0, ""
   419  	}
   420  
   421  	var stmt string
   422  	switch t := args[1].(type) {
   423  	case string:
   424  		stmt = t
   425  	default:
   426  		return normalQuery, 0, ""
   427  	}
   428  
   429  	if duration >= slowQueryThreshold.Seconds() {
   430  		return slowQuery, duration, stmt
   431  	}
   432  
   433  	return ignoreSlowQuery, duration, stmt
   434  }