vitess.io/vitess@v0.16.2/go/vt/vtgr/config/vtgr_config.go (about)

     1  /*
     2  Copyright 2021 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package config
    18  
    19  import (
    20  	"encoding/json"
    21  	"fmt"
    22  	"net/url"
    23  	"os"
    24  	"regexp"
    25  	"strings"
    26  
    27  	"gopkg.in/gcfg.v1"
    28  
    29  	"vitess.io/vitess/go/vt/log"
    30  )
    31  
// VTGRConfig is the config for VTGR.
//
// ReadVTGRConfig decodes a JSON file into this struct. The fields carry no
// struct tags, so the JSON keys are the field names themselves. The default
// values noted below are the ones assigned in newVTGRConfig.
type VTGRConfig struct {
	DisableReadOnlyProtection   bool // default: false
	BootstrapGroupSize          int  // default: 5
	MinNumReplica               int  // default: 3
	BackoffErrorWaitTimeSeconds int  // default: 10 (seconds, going by the field name)
	BootstrapWaitTimeSeconds    int  // default: 10 * 60 (seconds, going by the field name)
}
    40  
// vtgrCfg is the package-level VTGR configuration. It starts out holding the
// defaults from newVTGRConfig; ReadVTGRConfig updates it from a JSON file and
// returns it.
var vtgrCfg = newVTGRConfig()
    42  
    43  func newVTGRConfig() *VTGRConfig {
    44  	config := &VTGRConfig{
    45  		DisableReadOnlyProtection:   false,
    46  		BootstrapGroupSize:          5,
    47  		MinNumReplica:               3,
    48  		BackoffErrorWaitTimeSeconds: 10,
    49  		BootstrapWaitTimeSeconds:    10 * 60,
    50  	}
    51  	return config
    52  }
    53  
    54  // ReadVTGRConfig reads config for VTGR
    55  func ReadVTGRConfig(file string) (*VTGRConfig, error) {
    56  	vtgrFile, err := os.Open(file)
    57  	if err != nil {
    58  		return nil, err
    59  	}
    60  	decoder := json.NewDecoder(vtgrFile)
    61  	err = decoder.Decode(vtgrCfg)
    62  	if err != nil {
    63  		return nil, err
    64  	}
    65  	return vtgrCfg, nil
    66  }
    67  
    68  /*
    69  	Everything below has been copied over from the VTOrc package
    70  */
    71  
    72  var (
    73  	envVariableRegexp = regexp.MustCompile("[$][{](.*)[}]")
    74  )
    75  
    76  const (
    77  	DefaultStatusAPIEndpoint = "/api/status"
    78  )
    79  
    80  const (
    81  	MySQLTopologyMaxPoolConnections = 3
    82  )
    83  
// Configuration makes for orchestrator configuration input, which can be provided by the user via a JSON formatted file.
// Some of the parameters have reasonable default values, and some (like database credentials) are
// strictly expected from the user.
// TODO(sougou): change this to yaml parsing, and possible merge with tabletenv.
type Configuration struct {
	Debug                                       bool   // set debug mode (similar to --debug option)
	EnableSyslog                                bool   // Should logs be directed (in addition) to syslog daemon?
	ListenAddress                               string // Where orchestrator HTTP should listen for TCP
	ListenSocket                                string // Where orchestrator HTTP should listen for unix socket (default: empty; when given, TCP is disabled)
	HTTPAdvertise                               string // optional, for raft setups, what is the HTTP address this node will advertise to its peers (potentially use where behind NAT or when rerouting ports; example: "http://11.22.33.44:3030")
	AgentsServerPort                            string // port orchestrator agents talk back to
	MySQLTopologyUser                           string // The user VTOrc will use to connect to MySQL instances
	MySQLTopologyPassword                       string // The password VTOrc will use to connect to MySQL instances
	MySQLReplicaUser                            string // User to set on replica MySQL instances while configuring replication settings on them. If set, use this credential instead of discovering from mysql. TODO(sougou): deprecate this in favor of fetching from vttablet
	MySQLReplicaPassword                        string // Password to set on replica MySQL instances while configuring replication settings on them.
	MySQLTopologyCredentialsConfigFile          string // my.cnf style configuration file from where to pick credentials. Expecting `user`, `password` under `[client]` section
	MySQLTopologySSLPrivateKeyFile              string // Private key file used to authenticate with a Topology mysql instance with TLS
	MySQLTopologySSLCertFile                    string // Certificate PEM file used to authenticate with a Topology mysql instance with TLS
	MySQLTopologySSLCAFile                      string // Certificate Authority PEM file used to authenticate with a Topology mysql instance with TLS
	MySQLTopologySSLSkipVerify                  bool   // If true, do not strictly validate mutual TLS certs for Topology mysql instances
	MySQLTopologyUseMutualTLS                   bool   // Turn on TLS authentication with the Topology MySQL instances
	MySQLTopologyUseMixedTLS                    bool   // Mixed TLS and non-TLS authentication with the Topology MySQL instances
	TLSCacheTTLFactor                           uint   // Factor of InstancePollSeconds that we set as TLS info cache expiry
	BackendDB                                   string // EXPERIMENTAL: type of backend db; either "mysql" or "sqlite"
	SQLite3DataFile                             string // when BackendDB == "sqlite", full path to sqlite3 datafile
	SkipOrchestratorDatabaseUpdate              bool   // When true, do not check backend database schema nor attempt to update it. Useful when you may be running multiple versions of orchestrator, and you only wish certain boxes to dictate the db structure (or else any time a different orchestrator version runs it will rebuild database schema)
	PanicIfDifferentDatabaseDeploy              bool   // When true, and this process finds the orchestrator backend DB was provisioned by a different version, panic
	RaftEnabled                                 bool   // When true, setup orchestrator in a raft consensus layout. When false (default) all Raft* variables are ignored
	RaftBind                                    string
	RaftAdvertise                               string
	RaftDataDir                                 string
	DefaultRaftPort                             int      // if a RaftNodes entry does not specify port, use this one
	RaftNodes                                   []string // Raft nodes to make initial connection with
	ExpectFailureAnalysisConcensus              bool
	MySQLOrchestratorHost                       string
	MySQLOrchestratorMaxPoolConnections         int // The maximum size of the connection pool to the Orchestrator backend.
	MySQLOrchestratorPort                       uint
	MySQLOrchestratorDatabase                   string
	MySQLOrchestratorUser                       string
	MySQLOrchestratorPassword                   string
	MySQLOrchestratorCredentialsConfigFile      string   // my.cnf style configuration file from where to pick credentials. Expecting `user`, `password` under `[client]` section
	MySQLOrchestratorSSLPrivateKeyFile          string   // Private key file used to authenticate with the Orchestrator mysql instance with TLS
	MySQLOrchestratorSSLCertFile                string   // Certificate PEM file used to authenticate with the Orchestrator mysql instance with TLS
	MySQLOrchestratorSSLCAFile                  string   // Certificate Authority PEM file used to authenticate with the Orchestrator mysql instance with TLS
	MySQLOrchestratorSSLSkipVerify              bool     // If true, do not strictly validate mutual TLS certs for the Orchestrator mysql instances
	MySQLOrchestratorUseMutualTLS               bool     // Turn on TLS authentication with the Orchestrator MySQL instance
	MySQLOrchestratorReadTimeoutSeconds         int      // Number of seconds before backend mysql read operation is aborted (driver-side)
	MySQLOrchestratorRejectReadOnly             bool     // Reject read only connections https://github.com/go-sql-driver/mysql#rejectreadonly
	MySQLConnectTimeoutSeconds                  int      // Number of seconds before connection is aborted (driver-side)
	MySQLDiscoveryReadTimeoutSeconds            int      // Number of seconds before topology mysql read operation is aborted (driver-side). Used for discovery queries.
	MySQLTopologyReadTimeoutSeconds             int      // Number of seconds before topology mysql read operation is aborted (driver-side). Used for all but discovery queries.
	MySQLConnectionLifetimeSeconds              int      // Number of seconds the mysql driver will keep database connection alive before recycling it
	DefaultInstancePort                         int      // In case port was not specified on command line
	ReplicationLagQuery                         string   // custom query to check on replica lag (e.g. heartbeat table). Must return a single row with a single numeric column, which is the lag.
	ReplicationCredentialsQuery                 string   // custom query to get replication credentials. Must return a single row, with two text columns: 1st is username, 2nd is password. This is optional, and can be used by orchestrator to configure replication after primary takeover or setup of co-primary. You need to ensure the orchestrator user has the privileges to run this query
	DiscoverByShowSlaveHosts                    bool     // Attempt SHOW SLAVE HOSTS before PROCESSLIST
	UseSuperReadOnly                            bool     // Should orchestrator super_read_only any time it sets read_only
	InstancePollSeconds                         uint     // Number of seconds between instance reads
	InstanceWriteBufferSize                     int      // Instance write buffer size (max number of instances to flush in one INSERT ODKU)
	BufferInstanceWrites                        bool     // Set to 'true' for write-optimization on backend table (compromise: writes can be stale and overwrite non stale data)
	InstanceFlushIntervalMilliseconds           int      // Max interval between instance write buffer flushes
	UnseenInstanceForgetHours                   uint     // Number of hours after which an unseen instance is forgotten
	SnapshotTopologiesIntervalHours             uint     // Interval in hour between snapshot-topologies invocation. Default: 0 (disabled)
	DiscoveryMaxConcurrency                     uint     // Number of goroutines doing hosts discovery
	DiscoveryQueueCapacity                      uint     // Buffer size of the discovery queue. Should be greater than the number of DB instances being discovered
	DiscoveryQueueMaxStatisticsSize             int      // The maximum number of individual secondly statistics taken of the discovery queue
	DiscoveryCollectionRetentionSeconds         uint     // Number of seconds to retain the discovery collection information
	DiscoverySeeds                              []string // Hard coded array of hostname:port, ensuring orchestrator discovers these hosts upon startup, assuming not already known to orchestrator
	InstanceBulkOperationsWaitTimeoutSeconds    uint     // Time to wait on a single instance when doing bulk (many instances) operation
	HostnameResolveMethod                       string   // Method by which to "normalize" hostname ("none"/"default"/"cname")
	MySQLHostnameResolveMethod                  string   // Method by which to "normalize" hostname via MySQL server. ("none"/"@@hostname"/"@@report_host"; default "@@hostname")
	SkipBinlogServerUnresolveCheck              bool     // Skip the double-check that an unresolved hostname resolves back to same hostname for binlog servers
	ExpiryHostnameResolvesMinutes               int      // Number of minutes after which to expire hostname-resolves
	RejectHostnameResolvePattern                string   // Regexp pattern for resolved hostname that will not be accepted (not cached, not written to db). This is done to avoid storing wrong resolves due to network glitches.
	ReasonableReplicationLagSeconds             int      // Above this value is considered a problem
	ProblemIgnoreHostnameFilters                []string // Will minimize problem visualization for hostnames matching given regexp filters
	VerifyReplicationFilters                    bool     // Include replication filters check before approving topology refactoring
	ReasonableMaintenanceReplicationLagSeconds  int      // Above this value move-up and move-below are blocked
	CandidateInstanceExpireMinutes              uint     // Minutes after which a suggestion to use an instance as a candidate replica (to be preferably promoted on primary failover) is expired.
	AuditLogFile                                string   // Name of log file for audit operations. Disabled when empty.
	AuditToSyslog                               bool     // If true, audit messages are written to syslog
	AuditToBackendDB                            bool     // If true, audit messages are written to the backend DB's `audit` table (default: true)
	AuditPurgeDays                              uint     // Days after which audit entries are purged from the database
	RemoveTextFromHostnameDisplay               string   // Text to strip off the hostname on cluster/clusters pages
	ReadOnly                                    bool
	AuthenticationMethod                        string // Type of authentication to use, if any. "" for none, "basic" for BasicAuth, "multi" for advanced BasicAuth, "proxy" for forwarded credentials via reverse proxy, "token" for token based access
	OAuthClientID                               string
	OAuthClientSecret                           string
	OAuthScopes                                 []string
	HTTPAuthUser                                string            // Username for HTTP Basic authentication (blank disables authentication)
	HTTPAuthPassword                            string            // Password for HTTP Basic authentication
	AuthUserHeader                              string            // HTTP header indicating auth user, when AuthenticationMethod is "proxy"
	PowerAuthUsers                              []string          // On AuthenticationMethod == "proxy", list of users that can make changes. All others are read-only.
	PowerAuthGroups                             []string          // list of unix groups the authenticated user must be a member of to make changes.
	AccessTokenUseExpirySeconds                 uint              // Time by which an issued token must be used
	AccessTokenExpiryMinutes                    uint              // Time after which HTTP access token expires
	ClusterNameToAlias                          map[string]string // map between regex matching cluster name to a human friendly alias
	DetectClusterAliasQuery                     string            // Optional query (executed on topology instance) that returns the alias of a cluster. Query will only be executed on cluster primary (though until the topology's primary is resolved it may execute on other/all replicas). If provided, must return one row, one column
	DetectClusterDomainQuery                    string            // Optional query (executed on topology instance) that returns the VIP/CNAME/Alias/whatever domain name for the primary of this cluster. Query will only be executed on cluster primary (though until the topology's primary is resolved it may execute on other/all replicas). If provided, must return one row, one column
	DetectInstanceAliasQuery                    string            // Optional query (executed on topology instance) that returns the alias of an instance. If provided, must return one row, one column
	DetectPromotionRuleQuery                    string            // Optional query (executed on topology instance) that returns the promotion rule of an instance. If provided, must return one row, one column.
	DataCenterPattern                           string            // Regexp pattern with one group, extracting the datacenter name from the hostname
	RegionPattern                               string            // Regexp pattern with one group, extracting the region name from the hostname
	PhysicalEnvironmentPattern                  string            // Regexp pattern with one group, extracting physical environment info from hostname (e.g. combination of datacenter & prod/dev env)
	DetectDataCenterQuery                       string            // Optional query (executed on topology instance) that returns the data center of an instance. If provided, must return one row, one column. Overrides DataCenterPattern and useful for installments where DC cannot be inferred by hostname
	DetectRegionQuery                           string            // Optional query (executed on topology instance) that returns the region of an instance. If provided, must return one row, one column. Overrides RegionPattern and useful for installments where Region cannot be inferred by hostname
	DetectPhysicalEnvironmentQuery              string            // Optional query (executed on topology instance) that returns the physical environment of an instance. If provided, must return one row, one column. Overrides PhysicalEnvironmentPattern and useful for installments where env cannot be inferred by hostname
	DetectSemiSyncEnforcedQuery                 string            // Optional query (executed on topology instance) to determine whether semi-sync is fully enforced for primary writes (async fallback is not allowed under any circumstance). If provided, must return one row, one column, value 0 or 1.
	SupportFuzzyPoolHostnames                   bool              // Should "submit-pool-instances" command be able to pass list of fuzzy instances (fuzzy means non-fqdn, but unique enough to recognize). Defaults 'true', implies more queries on backend db
	InstancePoolExpiryMinutes                   uint              // Time after which entries in database_instance_pool are expired (resubmit via `submit-pool-instances`)
	PromotionIgnoreHostnameFilters              []string          // Orchestrator will not promote replicas with hostname matching pattern (via -c recovery; for example, avoid promoting dev-dedicated machines)
	ServeAgentsHTTP                             bool              // Spawn another HTTP interface dedicated for orchestrator-agent
	AgentsUseSSL                                bool              // When "true" orchestrator will listen on agents port with SSL as well as connect to agents via SSL
	AgentsUseMutualTLS                          bool              // When "true" Use mutual TLS for the server to agent communication
	AgentSSLSkipVerify                          bool              // When using SSL for the Agent, should we ignore SSL certification error
	AgentSSLPrivateKeyFile                      string            // Name of Agent SSL private key file, applies only when AgentsUseSSL = true
	AgentSSLCertFile                            string            // Name of Agent SSL certification file, applies only when AgentsUseSSL = true
	AgentSSLCAFile                              string            // Name of the Agent Certificate Authority file, applies only when AgentsUseSSL = true
	AgentSSLValidOUs                            []string          // Valid organizational units when using mutual TLS to communicate with the agents
	UseSSL                                      bool              // Use SSL on the server web port
	UseMutualTLS                                bool              // When "true" Use mutual TLS for the server's web and API connections
	SSLSkipVerify                               bool              // When using SSL, should we ignore SSL certification error
	SSLPrivateKeyFile                           string            // Name of SSL private key file, applies only when UseSSL = true
	SSLCertFile                                 string            // Name of SSL certification file, applies only when UseSSL = true
	SSLCAFile                                   string            // Name of the Certificate Authority file, applies only when UseSSL = true
	SSLValidOUs                                 []string          // Valid organizational units when using mutual TLS
	StatusEndpoint                              string            // Override the status endpoint.  Defaults to '/api/status'
	StatusOUVerify                              bool              // If true, try to verify OUs when Mutual TLS is on.  Defaults to false
	AgentPollMinutes                            uint              // Minutes between agent polling
	UnseenAgentForgetHours                      uint              // Number of hours after which an unseen agent is forgotten
	StaleSeedFailMinutes                        uint              // Number of minutes after which a stale (no progress) seed is considered failed.
	SeedAcceptableBytesDiff                     int64             // Difference in bytes between seed source & target data size that is still considered as successful copy
	SeedWaitSecondsBeforeSend                   int64             // Number of seconds for waiting before start send data command on agent
	BinlogEventsChunkSize                       int               // Chunk size (X) for SHOW BINLOG|RELAYLOG EVENTS LIMIT ?,X statements. Smaller means less locking and more work to be done
	ReduceReplicationAnalysisCount              bool              // When true, replication analysis will only report instances where possibility of handled problems is possible in the first place (e.g. will not report most leaf nodes, that are mostly uninteresting). When false, provides an entry for every known instance
	FailureDetectionPeriodBlockMinutes          int               // The time for which an instance's failure discovery is kept "active", so as to avoid concurrent "discoveries" of the instance's failure; this precedes any recovery process, if any.
	RecoveryPeriodBlockMinutes                  int               // (supported for backwards compatibility but please use newer `RecoveryPeriodBlockSeconds` instead) The time for which an instance's recovery is kept "active", so as to avoid concurrent recoveries on same instance as well as flapping
	RecoveryPeriodBlockSeconds                  int               // (overrides `RecoveryPeriodBlockMinutes`) The time for which an instance's recovery is kept "active", so as to avoid concurrent recoveries on same instance as well as flapping
	RecoveryIgnoreHostnameFilters               []string          // Recovery analysis will completely ignore hosts matching given patterns
	RecoverPrimaryClusterFilters                []string          // Only do primary recovery on clusters matching these regexp patterns (of course the ".*" pattern matches everything)
	RecoverIntermediatePrimaryClusterFilters    []string          // Only do IM recovery on clusters matching these regexp patterns (of course the ".*" pattern matches everything)
	ProcessesShellCommand                       string            // Shell that executes command scripts
	OnFailureDetectionProcesses                 []string          // Processes to execute when detecting a failover scenario (before making a decision whether to failover or not). May and should use some of these placeholders: {failureType}, {instanceType}, {isPrimary}, {isCoPrimary}, {failureDescription}, {command}, {failedHost}, {failureCluster}, {failureClusterDomain}, {failedPort}, {successorHost}, {successorPort}, {successorAlias}, {countReplicas}, {replicaHosts}, {isDowntimed}, {autoPrimaryRecovery}, {autoIntermediatePrimaryRecovery}
	PreFailoverProcesses                        []string          // Processes to execute before doing a failover (aborting operation should any one of them exit with non-zero code; order of execution undefined). May and should use some of these placeholders: {failureType}, {instanceType}, {isPrimary}, {isCoPrimary}, {failureDescription}, {command}, {failedHost}, {failureCluster}, {failureClusterDomain}, {failedPort}, {countReplicas}, {replicaHosts}, {isDowntimed}
	PostFailoverProcesses                       []string          // Processes to execute after doing a failover (order of execution undefined). May and should use some of these placeholders: {failureType}, {instanceType}, {isPrimary}, {isCoPrimary}, {failureDescription}, {command}, {failedHost}, {failureCluster}, {failureClusterDomain}, {failedPort}, {successorHost}, {successorPort}, {successorAlias}, {countReplicas}, {replicaHosts}, {isDowntimed}, {isSuccessful}, {lostReplicas}, {countLostReplicas}
	PostUnsuccessfulFailoverProcesses           []string          // Processes to execute after a not-completely-successful failover (order of execution undefined). May and should use some of these placeholders: {failureType}, {instanceType}, {isPrimary}, {isCoPrimary}, {failureDescription}, {command}, {failedHost}, {failureCluster}, {failureClusterDomain}, {failedPort}, {successorHost}, {successorPort}, {successorAlias}, {countReplicas}, {replicaHosts}, {isDowntimed}, {isSuccessful}, {lostReplicas}, {countLostReplicas}
	PostPrimaryFailoverProcesses                []string          // Processes to execute after doing a primary failover (order of execution undefined). Uses same placeholders as PostFailoverProcesses
	PostIntermediatePrimaryFailoverProcesses    []string          // Processes to execute after doing a primary failover (order of execution undefined). Uses same placeholders as PostFailoverProcesses
	PostTakePrimaryProcesses                    []string          // Processes to execute after a successful Take-Primary event has taken place
	CoPrimaryRecoveryMustPromoteOtherCoPrimary  bool              // When 'false', anything can get promoted (and candidates are preferred over others). When 'true', orchestrator will promote the other co-primary or else fail
	DetachLostReplicasAfterPrimaryFailover      bool              // Should replicas that are not to be lost in primary recovery (i.e. were more up-to-date than promoted replica) be forcibly detached
	ApplyMySQLPromotionAfterPrimaryFailover     bool              // Should orchestrator take upon itself to apply MySQL primary promotion: set read_only=0, detach replication, etc.
	PreventCrossDataCenterPrimaryFailover       bool              // When true (default: false), cross-DC primary failover are not allowed, orchestrator will do all it can to only fail over within same DC, or else not fail over at all.
	PreventCrossRegionPrimaryFailover           bool              // When true (default: false), cross-region primary failover are not allowed, orchestrator will do all it can to only fail over within same region, or else not fail over at all.
	PrimaryFailoverLostInstancesDowntimeMinutes uint              // Number of minutes to downtime any server that was lost after a primary failover (including failed primary & lost replicas). 0 to disable
	PrimaryFailoverDetachReplicaPrimaryHost     bool              // Should orchestrator issue a detach-replica-primary-host on newly promoted primary (this makes sure the new primary will not attempt to replicate old primary if that comes back to life). Defaults 'false'. Meaningless if ApplyMySQLPromotionAfterPrimaryFailover is 'true'.
	FailPrimaryPromotionOnLagMinutes            uint              // when > 0, fail a primary promotion if the candidate replica is lagging >= configured number of minutes.
	FailPrimaryPromotionIfSQLThreadNotUpToDate  bool              // when true, and a primary failover takes place, if candidate primary has not consumed all relay logs, promotion is aborted with error
	DelayPrimaryPromotionIfSQLThreadNotUpToDate bool              // when true, and a primary failover takes place, if candidate primary has not consumed all relay logs, delay promotion until the sql thread has caught up
	PostponeReplicaRecoveryOnLagMinutes         uint              // On crash recovery, replicas that are lagging more than given minutes are only resurrected late in the recovery process, after primary/IM has been elected and processes executed. Value of 0 disables this feature
	OSCIgnoreHostnameFilters                    []string          // OSC replicas recommendation will ignore replica hostnames matching given patterns
	URLPrefix                                   string            // URL prefix to run orchestrator on non-root web path, e.g. /orchestrator to put it behind nginx.
	DiscoveryIgnoreReplicaHostnameFilters       []string          // Regexp filters to apply to prevent auto-discovering new replicas. Usage: unreachable servers due to firewalls, applications which trigger binlog dumps
	DiscoveryIgnorePrimaryHostnameFilters       []string          // Regexp filters to apply to prevent auto-discovering a primary. Usage: pointing your primary temporarily to replicate some data from external host
	DiscoveryIgnoreHostnameFilters              []string          // Regexp filters to apply to prevent discovering instances of any kind
	WebMessage                                  string            // If provided, will be shown on all web pages below the title bar
	MaxConcurrentReplicaOperations              int               // Maximum number of concurrent operations on replicas
	InstanceDBExecContextTimeoutSeconds         int               // Timeout on context used while calling ExecContext on instance database
	LockShardTimeoutSeconds                     int               // Timeout on context used to lock shard. Should be a small value because we should fail-fast
	WaitReplicasTimeoutSeconds                  int               // Timeout on amount of time to wait for the replicas in case of ERS. Should be a small value because we should fail-fast. Should not be larger than LockShardTimeoutSeconds since that is the total time we use for an ERS.
}
   255  
   256  // ToJSONString will marshal this configuration as JSON
   257  func (config *Configuration) ToJSONString() string {
   258  	b, _ := json.Marshal(config)
   259  	return string(b)
   260  }
   261  
   262  // Config is *the* configuration instance, used globally to get configuration data
   263  var Config = newConfiguration()
   264  var readFileNames []string
   265  
   266  func newConfiguration() *Configuration {
   267  	return &Configuration{
   268  		Debug:                                       false,
   269  		EnableSyslog:                                false,
   270  		ListenAddress:                               ":3000",
   271  		ListenSocket:                                "",
   272  		HTTPAdvertise:                               "",
   273  		AgentsServerPort:                            ":3001",
   274  		StatusEndpoint:                              DefaultStatusAPIEndpoint,
   275  		StatusOUVerify:                              false,
   276  		BackendDB:                                   "sqlite",
   277  		SQLite3DataFile:                             "file::memory:?mode=memory&cache=shared",
   278  		SkipOrchestratorDatabaseUpdate:              false,
   279  		PanicIfDifferentDatabaseDeploy:              false,
   280  		RaftBind:                                    "127.0.0.1:10008",
   281  		RaftAdvertise:                               "",
   282  		RaftDataDir:                                 "",
   283  		DefaultRaftPort:                             10008,
   284  		RaftNodes:                                   []string{},
   285  		ExpectFailureAnalysisConcensus:              true,
   286  		MySQLOrchestratorMaxPoolConnections:         128, // limit concurrent conns to backend DB
   287  		MySQLOrchestratorPort:                       3306,
   288  		MySQLTopologyUseMutualTLS:                   false,
   289  		MySQLTopologyUseMixedTLS:                    true,
   290  		MySQLOrchestratorUseMutualTLS:               false,
   291  		MySQLConnectTimeoutSeconds:                  2,
   292  		MySQLOrchestratorReadTimeoutSeconds:         30,
   293  		MySQLOrchestratorRejectReadOnly:             false,
   294  		MySQLDiscoveryReadTimeoutSeconds:            10,
   295  		MySQLTopologyReadTimeoutSeconds:             600,
   296  		MySQLConnectionLifetimeSeconds:              0,
   297  		DefaultInstancePort:                         3306,
   298  		TLSCacheTTLFactor:                           100,
   299  		InstancePollSeconds:                         5,
   300  		InstanceWriteBufferSize:                     100,
   301  		BufferInstanceWrites:                        false,
   302  		InstanceFlushIntervalMilliseconds:           100,
   303  		UnseenInstanceForgetHours:                   240,
   304  		SnapshotTopologiesIntervalHours:             0,
   305  		DiscoverByShowSlaveHosts:                    false,
   306  		UseSuperReadOnly:                            false,
   307  		DiscoveryMaxConcurrency:                     300,
   308  		DiscoveryQueueCapacity:                      100000,
   309  		DiscoveryQueueMaxStatisticsSize:             120,
   310  		DiscoveryCollectionRetentionSeconds:         120,
   311  		DiscoverySeeds:                              []string{},
   312  		InstanceBulkOperationsWaitTimeoutSeconds:    10,
   313  		HostnameResolveMethod:                       "default",
   314  		MySQLHostnameResolveMethod:                  "none",
   315  		SkipBinlogServerUnresolveCheck:              true,
   316  		ExpiryHostnameResolvesMinutes:               60,
   317  		RejectHostnameResolvePattern:                "",
   318  		ReasonableReplicationLagSeconds:             10,
   319  		ProblemIgnoreHostnameFilters:                []string{},
   320  		VerifyReplicationFilters:                    false,
   321  		ReasonableMaintenanceReplicationLagSeconds:  20,
   322  		CandidateInstanceExpireMinutes:              60,
   323  		AuditLogFile:                                "",
   324  		AuditToSyslog:                               false,
   325  		AuditToBackendDB:                            false,
   326  		AuditPurgeDays:                              7,
   327  		RemoveTextFromHostnameDisplay:               "",
   328  		ReadOnly:                                    false,
   329  		AuthenticationMethod:                        "",
   330  		HTTPAuthUser:                                "",
   331  		HTTPAuthPassword:                            "",
   332  		AuthUserHeader:                              "X-Forwarded-User",
   333  		PowerAuthUsers:                              []string{"*"},
   334  		PowerAuthGroups:                             []string{},
   335  		AccessTokenUseExpirySeconds:                 60,
   336  		AccessTokenExpiryMinutes:                    1440,
   337  		ClusterNameToAlias:                          make(map[string]string),
   338  		DetectClusterAliasQuery:                     "",
   339  		DetectClusterDomainQuery:                    "",
   340  		DetectInstanceAliasQuery:                    "",
   341  		DetectPromotionRuleQuery:                    "",
   342  		DataCenterPattern:                           "",
   343  		PhysicalEnvironmentPattern:                  "",
   344  		DetectDataCenterQuery:                       "",
   345  		DetectPhysicalEnvironmentQuery:              "",
   346  		DetectSemiSyncEnforcedQuery:                 "",
   347  		SupportFuzzyPoolHostnames:                   true,
   348  		InstancePoolExpiryMinutes:                   60,
   349  		PromotionIgnoreHostnameFilters:              []string{},
   350  		ServeAgentsHTTP:                             false,
   351  		AgentsUseSSL:                                false,
   352  		AgentsUseMutualTLS:                          false,
   353  		AgentSSLValidOUs:                            []string{},
   354  		AgentSSLSkipVerify:                          false,
   355  		AgentSSLPrivateKeyFile:                      "",
   356  		AgentSSLCertFile:                            "",
   357  		AgentSSLCAFile:                              "",
   358  		UseSSL:                                      false,
   359  		UseMutualTLS:                                false,
   360  		SSLValidOUs:                                 []string{},
   361  		SSLSkipVerify:                               false,
   362  		SSLPrivateKeyFile:                           "",
   363  		SSLCertFile:                                 "",
   364  		SSLCAFile:                                   "",
   365  		AgentPollMinutes:                            60,
   366  		UnseenAgentForgetHours:                      6,
   367  		StaleSeedFailMinutes:                        60,
   368  		SeedAcceptableBytesDiff:                     8192,
   369  		SeedWaitSecondsBeforeSend:                   2,
   370  		BinlogEventsChunkSize:                       10000,
   371  		ReduceReplicationAnalysisCount:              true,
   372  		FailureDetectionPeriodBlockMinutes:          60,
   373  		RecoveryPeriodBlockMinutes:                  60,
   374  		RecoveryPeriodBlockSeconds:                  3600,
   375  		RecoveryIgnoreHostnameFilters:               []string{},
   376  		RecoverPrimaryClusterFilters:                []string{"*"},
   377  		RecoverIntermediatePrimaryClusterFilters:    []string{},
   378  		ProcessesShellCommand:                       "bash",
   379  		OnFailureDetectionProcesses:                 []string{},
   380  		PreFailoverProcesses:                        []string{},
   381  		PostPrimaryFailoverProcesses:                []string{},
   382  		PostIntermediatePrimaryFailoverProcesses:    []string{},
   383  		PostFailoverProcesses:                       []string{},
   384  		PostUnsuccessfulFailoverProcesses:           []string{},
   385  		PostTakePrimaryProcesses:                    []string{},
   386  		CoPrimaryRecoveryMustPromoteOtherCoPrimary:  true,
   387  		DetachLostReplicasAfterPrimaryFailover:      true,
   388  		ApplyMySQLPromotionAfterPrimaryFailover:     true,
   389  		PreventCrossDataCenterPrimaryFailover:       false,
   390  		PreventCrossRegionPrimaryFailover:           false,
   391  		PrimaryFailoverLostInstancesDowntimeMinutes: 0,
   392  		PrimaryFailoverDetachReplicaPrimaryHost:     false,
   393  		FailPrimaryPromotionOnLagMinutes:            0,
   394  		FailPrimaryPromotionIfSQLThreadNotUpToDate:  false,
   395  		DelayPrimaryPromotionIfSQLThreadNotUpToDate: true,
   396  		PostponeReplicaRecoveryOnLagMinutes:         0,
   397  		OSCIgnoreHostnameFilters:                    []string{},
   398  		URLPrefix:                                   "",
   399  		DiscoveryIgnoreReplicaHostnameFilters:       []string{},
   400  		WebMessage:                                  "",
   401  		MaxConcurrentReplicaOperations:              5,
   402  		InstanceDBExecContextTimeoutSeconds:         30,
   403  		LockShardTimeoutSeconds:                     30,
   404  		WaitReplicasTimeoutSeconds:                  30,
   405  	}
   406  }
   407  
   408  func (config *Configuration) postReadAdjustments() error {
   409  	if config.MySQLOrchestratorCredentialsConfigFile != "" {
   410  		mySQLConfig := struct {
   411  			Client struct {
   412  				User     string
   413  				Password string
   414  			}
   415  		}{}
   416  		err := gcfg.ReadFileInto(&mySQLConfig, config.MySQLOrchestratorCredentialsConfigFile)
   417  		if err != nil {
   418  			log.Fatalf("Failed to parse gcfg data from file: %+v", err)
   419  		} else {
   420  			log.Infof("Parsed orchestrator credentials from %s", config.MySQLOrchestratorCredentialsConfigFile)
   421  			config.MySQLOrchestratorUser = mySQLConfig.Client.User
   422  			config.MySQLOrchestratorPassword = mySQLConfig.Client.Password
   423  		}
   424  	}
   425  	{
   426  		// We accept password in the form "${SOME_ENV_VARIABLE}" in which case we pull
   427  		// the given variable from os env
   428  		submatch := envVariableRegexp.FindStringSubmatch(config.MySQLOrchestratorPassword)
   429  		if len(submatch) > 1 {
   430  			config.MySQLOrchestratorPassword = os.Getenv(submatch[1])
   431  		}
   432  	}
   433  	if config.MySQLTopologyCredentialsConfigFile != "" {
   434  		mySQLConfig := struct {
   435  			Client struct {
   436  				User     string
   437  				Password string
   438  			}
   439  		}{}
   440  		err := gcfg.ReadFileInto(&mySQLConfig, config.MySQLTopologyCredentialsConfigFile)
   441  		if err != nil {
   442  			log.Fatalf("Failed to parse gcfg data from file: %+v", err)
   443  		} else {
   444  			log.Infof("Parsed topology credentials from %s", config.MySQLTopologyCredentialsConfigFile)
   445  			config.MySQLTopologyUser = mySQLConfig.Client.User
   446  			config.MySQLTopologyPassword = mySQLConfig.Client.Password
   447  		}
   448  	}
   449  	{
   450  		// We accept password in the form "${SOME_ENV_VARIABLE}" in which case we pull
   451  		// the given variable from os env
   452  		submatch := envVariableRegexp.FindStringSubmatch(config.MySQLTopologyPassword)
   453  		if len(submatch) > 1 {
   454  			config.MySQLTopologyPassword = os.Getenv(submatch[1])
   455  		}
   456  	}
   457  
   458  	if config.RecoveryPeriodBlockSeconds == 0 && config.RecoveryPeriodBlockMinutes > 0 {
   459  		// RecoveryPeriodBlockSeconds is a newer addition that overrides RecoveryPeriodBlockMinutes
   460  		// The code does not consider RecoveryPeriodBlockMinutes anymore, but RecoveryPeriodBlockMinutes
   461  		// still supported in config file for backwards compatibility
   462  		config.RecoveryPeriodBlockSeconds = config.RecoveryPeriodBlockMinutes * 60
   463  	}
   464  
   465  	if config.FailPrimaryPromotionIfSQLThreadNotUpToDate && config.DelayPrimaryPromotionIfSQLThreadNotUpToDate {
   466  		return fmt.Errorf("Cannot have both FailPrimaryPromotionIfSQLThreadNotUpToDate and DelayPrimaryPromotionIfSQLThreadNotUpToDate enabled")
   467  	}
   468  	if config.FailPrimaryPromotionOnLagMinutes > 0 && config.ReplicationLagQuery == "" {
   469  		return fmt.Errorf("nonzero FailPrimaryPromotionOnLagMinutes requires ReplicationLagQuery to be set")
   470  	}
   471  
   472  	if config.URLPrefix != "" {
   473  		// Ensure the prefix starts with "/" and has no trailing one.
   474  		config.URLPrefix = strings.TrimLeft(config.URLPrefix, "/")
   475  		config.URLPrefix = strings.TrimRight(config.URLPrefix, "/")
   476  		config.URLPrefix = "/" + config.URLPrefix
   477  	}
   478  
   479  	if config.IsSQLite() && config.SQLite3DataFile == "" {
   480  		return fmt.Errorf("SQLite3DataFile must be set when BackendDB is sqlite")
   481  	}
   482  	if config.RaftEnabled && config.RaftDataDir == "" {
   483  		return fmt.Errorf("RaftDataDir must be defined since raft is enabled (RaftEnabled)")
   484  	}
   485  	if config.RaftEnabled && config.RaftBind == "" {
   486  		return fmt.Errorf("RaftBind must be defined since raft is enabled (RaftEnabled)")
   487  	}
   488  	if config.RaftAdvertise == "" {
   489  		config.RaftAdvertise = config.RaftBind
   490  	}
   491  	if config.HTTPAdvertise != "" {
   492  		u, err := url.Parse(config.HTTPAdvertise)
   493  		if err != nil {
   494  			return fmt.Errorf("Failed parsing HTTPAdvertise %s: %s", config.HTTPAdvertise, err.Error())
   495  		}
   496  		if u.Scheme == "" {
   497  			return fmt.Errorf("If specified, HTTPAdvertise must include scheme (http:// or https://)")
   498  		}
   499  		if u.Hostname() == "" {
   500  			return fmt.Errorf("If specified, HTTPAdvertise must include host name")
   501  		}
   502  		if u.Port() == "" {
   503  			return fmt.Errorf("If specified, HTTPAdvertise must include port number")
   504  		}
   505  		if u.Path != "" {
   506  			return fmt.Errorf("If specified, HTTPAdvertise must not specify a path")
   507  		}
   508  		if config.InstanceWriteBufferSize <= 0 {
   509  			config.BufferInstanceWrites = false
   510  		}
   511  	}
   512  	return nil
   513  }
   514  
   515  func (config *Configuration) IsSQLite() bool {
   516  	return strings.Contains(config.BackendDB, "sqlite")
   517  }
   518  
   519  func (config *Configuration) IsMySQL() bool {
   520  	return config.BackendDB == "mysql" || config.BackendDB == ""
   521  }
   522  
   523  // read reads configuration from given file, or silently skips if the file does not exist.
   524  // If the file does exist, then it is expected to be in valid JSON format or the function bails out.
   525  func read(fileName string) (*Configuration, error) {
   526  	if fileName == "" {
   527  		return Config, fmt.Errorf("Empty file name")
   528  	}
   529  	file, err := os.Open(fileName)
   530  	if err != nil {
   531  		return Config, err
   532  	}
   533  	decoder := json.NewDecoder(file)
   534  	err = decoder.Decode(Config)
   535  	if err == nil {
   536  		log.Infof("Read config: %s", fileName)
   537  	} else {
   538  		log.Fatal("Cannot read config file:", fileName, err)
   539  	}
   540  	if err := Config.postReadAdjustments(); err != nil {
   541  		log.Fatal(err)
   542  	}
   543  	return Config, err
   544  }
   545  
   546  // ForceRead reads configuration from given file name or bails out if it fails
   547  func ForceRead(fileName string) *Configuration {
   548  	_, err := read(fileName)
   549  	if err != nil {
   550  		log.Fatal("Cannot read config file:", fileName, err)
   551  	}
   552  	readFileNames = []string{fileName}
   553  	return Config
   554  }
   555  
// CLIFlags stores some command line flags that are globally available in the process' lifetime.
// All fields except ConfiguredVersion are pointers, so they are nil in a zero-valued CLIFlags.
// NOTE(review): presumably the pointers are bound to flag variables registered by the
// command-line entry point — confirm against the code that populates this struct.
type CLIFlags struct {
	Noop                       *bool
	SkipUnresolve              *bool
	SkipUnresolveCheck         *bool
	BinlogFile                 *string
	GrabElection               *bool
	Version                    *bool
	Statement                  *string
	PromotionRule              *string
	ConfiguredVersion          string
	SkipContinuousRegistration *bool
	EnableDatabaseUpdate       *bool
	IgnoreRaftSetup            *bool
	Tag                        *string
}
   572  
// RuntimeCLIFlags is the package-level CLIFlags value. It is declared with its
// zero value, so every pointer field is nil until something assigns it.
var RuntimeCLIFlags CLIFlags