vitess.io/vitess@v0.16.2/go/vt/vtorc/config/config.go (about)

     1  /*
     2     Copyright 2014 Outbrain Inc.
     3  
     4     Licensed under the Apache License, Version 2.0 (the "License");
     5     you may not use this file except in compliance with the License.
     6     You may obtain a copy of the License at
     7  
     8         http://www.apache.org/licenses/LICENSE-2.0
     9  
    10     Unless required by applicable law or agreed to in writing, software
    11     distributed under the License is distributed on an "AS IS" BASIS,
    12     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13     See the License for the specific language governing permissions and
    14     limitations under the License.
    15  */
    16  
    17  package config
    18  
    19  import (
    20  	"encoding/json"
    21  	"fmt"
    22  	"os"
    23  	"time"
    24  
    25  	"github.com/spf13/pflag"
    26  
    27  	"vitess.io/vitess/go/vt/log"
    28  )
    29  
    30  const (
    31  	LostInRecoveryDowntimeSeconds int = 60 * 60 * 24 * 365
    32  )
    33  
    34  var configurationLoaded = make(chan bool)
    35  
    36  const (
    37  	HealthPollSeconds                     = 1
    38  	ActiveNodeExpireSeconds               = 5
    39  	MaintenanceOwner                      = "vtorc"
    40  	AuditPageSize                         = 20
    41  	MaintenancePurgeDays                  = 7
    42  	MaintenanceExpireMinutes              = 10
    43  	DebugMetricsIntervalSeconds           = 10
    44  	StaleInstanceCoordinatesExpireSeconds = 60
    45  	DiscoveryMaxConcurrency               = 300 // Number of goroutines doing hosts discovery
    46  	DiscoveryQueueCapacity                = 100000
    47  	DiscoveryQueueMaxStatisticsSize       = 120
    48  	DiscoveryCollectionRetentionSeconds   = 120
    49  	HostnameResolveMethod                 = "default"
    50  	UnseenInstanceForgetHours             = 240 // Number of hours after which an unseen instance is forgotten
    51  	ExpiryHostnameResolvesMinutes         = 60  // Number of minutes after which to expire hostname-resolves
    52  	CandidateInstanceExpireMinutes        = 60  // Minutes after which a suggestion to use an instance as a candidate replica (to be preferably promoted on primary failover) is expired.
    53  	FailureDetectionPeriodBlockMinutes    = 60  // The time for which an instance's failure discovery is kept "active", so as to avoid concurrent "discoveries" of the instance's failure; this preceeds any recovery process, if any.
    54  )
    55  
    56  var (
    57  	sqliteDataFile                 = "file::memory:?mode=memory&cache=shared"
    58  	instancePollTime               = 5 * time.Second
    59  	snapshotTopologyInterval       = 0 * time.Hour
    60  	reasonableReplicationLag       = 10 * time.Second
    61  	auditFileLocation              = ""
    62  	auditToBackend                 = false
    63  	auditToSyslog                  = false
    64  	auditPurgeDuration             = 7 * 24 * time.Hour // Equivalent of 7 days
    65  	recoveryPeriodBlockDuration    = 30 * time.Second
    66  	preventCrossCellFailover       = false
    67  	waitReplicasTimeout            = 30 * time.Second
    68  	topoInformationRefreshDuration = 15 * time.Second
    69  	recoveryPollDuration           = 1 * time.Second
    70  )
    71  
    72  // RegisterFlags registers the flags required by VTOrc
    73  func RegisterFlags(fs *pflag.FlagSet) {
    74  	fs.StringVar(&sqliteDataFile, "sqlite-data-file", sqliteDataFile, "SQLite Datafile to use as VTOrc's database")
    75  	fs.DurationVar(&instancePollTime, "instance-poll-time", instancePollTime, "Timer duration on which VTOrc refreshes MySQL information")
    76  	fs.DurationVar(&snapshotTopologyInterval, "snapshot-topology-interval", snapshotTopologyInterval, "Timer duration on which VTOrc takes a snapshot of the current MySQL information it has in the database. Should be in multiple of hours")
    77  	fs.DurationVar(&reasonableReplicationLag, "reasonable-replication-lag", reasonableReplicationLag, "Maximum replication lag on replicas which is deemed to be acceptable")
    78  	fs.StringVar(&auditFileLocation, "audit-file-location", auditFileLocation, "File location where the audit logs are to be stored")
    79  	fs.BoolVar(&auditToBackend, "audit-to-backend", auditToBackend, "Whether to store the audit log in the VTOrc database")
    80  	fs.BoolVar(&auditToSyslog, "audit-to-syslog", auditToSyslog, "Whether to store the audit log in the syslog")
    81  	fs.DurationVar(&auditPurgeDuration, "audit-purge-duration", auditPurgeDuration, "Duration for which audit logs are held before being purged. Should be in multiples of days")
    82  	fs.DurationVar(&recoveryPeriodBlockDuration, "recovery-period-block-duration", recoveryPeriodBlockDuration, "Duration for which a new recovery is blocked on an instance after running a recovery")
    83  	fs.BoolVar(&preventCrossCellFailover, "prevent-cross-cell-failover", preventCrossCellFailover, "Prevent VTOrc from promoting a primary in a different cell than the current primary in case of a failover")
    84  	fs.Duration("lock-shard-timeout", 30*time.Second, "Duration for which a shard lock is held when running a recovery")
    85  	_ = fs.MarkDeprecated("lock-shard-timeout", "Please use lock-timeout instead.")
    86  	fs.DurationVar(&waitReplicasTimeout, "wait-replicas-timeout", waitReplicasTimeout, "Duration for which to wait for replica's to respond when issuing RPCs")
    87  	fs.DurationVar(&topoInformationRefreshDuration, "topo-information-refresh-duration", topoInformationRefreshDuration, "Timer duration on which VTOrc refreshes the keyspace and vttablet records from the topology server")
    88  	fs.DurationVar(&recoveryPollDuration, "recovery-poll-duration", recoveryPollDuration, "Timer duration on which VTOrc polls its database to run a recovery")
    89  }
    90  
    91  // Configuration makes for vtorc configuration input, which can be provided by user via JSON formatted file.
    92  // Some of the parameteres have reasonable default values, and some (like database credentials) are
    93  // strictly expected from user.
    94  // TODO(sougou): change this to yaml parsing, and possible merge with tabletenv.
    95  type Configuration struct {
    96  	SQLite3DataFile                       string // full path to sqlite3 datafile
    97  	InstancePollSeconds                   uint   // Number of seconds between instance reads
    98  	SnapshotTopologiesIntervalHours       uint   // Interval in hour between snapshot-topologies invocation. Default: 0 (disabled)
    99  	ReasonableReplicationLagSeconds       int    // Above this value is considered a problem
   100  	AuditLogFile                          string // Name of log file for audit operations. Disabled when empty.
   101  	AuditToSyslog                         bool   // If true, audit messages are written to syslog
   102  	AuditToBackendDB                      bool   // If true, audit messages are written to the backend DB's `audit` table (default: true)
   103  	AuditPurgeDays                        uint   // Days after which audit entries are purged from the database
   104  	RecoveryPeriodBlockSeconds            int    // (overrides `RecoveryPeriodBlockMinutes`) The time for which an instance's recovery is kept "active", so as to avoid concurrent recoveries on smae instance as well as flapping
   105  	PreventCrossDataCenterPrimaryFailover bool   // When true (default: false), cross-DC primary failover are not allowed, vtorc will do all it can to only fail over within same DC, or else not fail over at all.
   106  	WaitReplicasTimeoutSeconds            int    // Timeout on amount of time to wait for the replicas in case of ERS. Should be a small value because we should fail-fast. Should not be larger than LockTimeout since that is the total time we use for an ERS.
   107  	TopoInformationRefreshSeconds         int    // Timer duration on which VTOrc refreshes the keyspace and vttablet records from the topo-server.
   108  	RecoveryPollSeconds                   int    // Timer duration on which VTOrc recovery analysis runs
   109  }
   110  
   111  // ToJSONString will marshal this configuration as JSON
   112  func (config *Configuration) ToJSONString() string {
   113  	b, _ := json.Marshal(config)
   114  	return string(b)
   115  }
   116  
   117  // Config is *the* configuration instance, used globally to get configuration data
   118  var Config = newConfiguration()
   119  var readFileNames []string
   120  
   121  // UpdateConfigValuesFromFlags is used to update the config values from the flags defined.
   122  // This is done before we read any configuration files from the user. So the config files take precedence.
   123  func UpdateConfigValuesFromFlags() {
   124  	Config.SQLite3DataFile = sqliteDataFile
   125  	Config.InstancePollSeconds = uint(instancePollTime / time.Second)
   126  	Config.InstancePollSeconds = uint(instancePollTime / time.Second)
   127  	Config.SnapshotTopologiesIntervalHours = uint(snapshotTopologyInterval / time.Hour)
   128  	Config.ReasonableReplicationLagSeconds = int(reasonableReplicationLag / time.Second)
   129  	Config.AuditLogFile = auditFileLocation
   130  	Config.AuditToBackendDB = auditToBackend
   131  	Config.AuditToSyslog = auditToSyslog
   132  	Config.AuditPurgeDays = uint(auditPurgeDuration / (time.Hour * 24))
   133  	Config.RecoveryPeriodBlockSeconds = int(recoveryPeriodBlockDuration / time.Second)
   134  	Config.PreventCrossDataCenterPrimaryFailover = preventCrossCellFailover
   135  	Config.WaitReplicasTimeoutSeconds = int(waitReplicasTimeout / time.Second)
   136  	Config.TopoInformationRefreshSeconds = int(topoInformationRefreshDuration / time.Second)
   137  	Config.RecoveryPollSeconds = int(recoveryPollDuration / time.Second)
   138  }
   139  
   140  // LogConfigValues is used to log the config values.
   141  func LogConfigValues() {
   142  	b, _ := json.MarshalIndent(Config, "", "\t")
   143  	log.Infof("Running with Configuration - %v", string(b))
   144  }
   145  
   146  func newConfiguration() *Configuration {
   147  	return &Configuration{
   148  		SQLite3DataFile:                       "file::memory:?mode=memory&cache=shared",
   149  		InstancePollSeconds:                   5,
   150  		SnapshotTopologiesIntervalHours:       0,
   151  		ReasonableReplicationLagSeconds:       10,
   152  		AuditLogFile:                          "",
   153  		AuditToSyslog:                         false,
   154  		AuditToBackendDB:                      false,
   155  		AuditPurgeDays:                        7,
   156  		RecoveryPeriodBlockSeconds:            30,
   157  		PreventCrossDataCenterPrimaryFailover: false,
   158  		WaitReplicasTimeoutSeconds:            30,
   159  		TopoInformationRefreshSeconds:         15,
   160  		RecoveryPollSeconds:                   1,
   161  	}
   162  }
   163  
   164  func (config *Configuration) postReadAdjustments() error {
   165  	if config.SQLite3DataFile == "" {
   166  		return fmt.Errorf("SQLite3DataFile must be set")
   167  	}
   168  
   169  	return nil
   170  }
   171  
   172  // read reads configuration from given file, or silently skips if the file does not exist.
   173  // If the file does exist, then it is expected to be in valid JSON format or the function bails out.
   174  func read(fileName string) (*Configuration, error) {
   175  	if fileName == "" {
   176  		return Config, fmt.Errorf("Empty file name")
   177  	}
   178  	file, err := os.Open(fileName)
   179  	if err != nil {
   180  		return Config, err
   181  	}
   182  	decoder := json.NewDecoder(file)
   183  	err = decoder.Decode(Config)
   184  	if err == nil {
   185  		log.Infof("Read config: %s", fileName)
   186  	} else {
   187  		log.Fatal("Cannot read config file:", fileName, err)
   188  	}
   189  	if err := Config.postReadAdjustments(); err != nil {
   190  		log.Fatal(err)
   191  	}
   192  	return Config, err
   193  }
   194  
   195  // Read reads configuration from zero, either, some or all given files, in order of input.
   196  // A file can override configuration provided in previous file.
   197  func Read(fileNames ...string) *Configuration {
   198  	for _, fileName := range fileNames {
   199  		_, _ = read(fileName)
   200  	}
   201  	readFileNames = fileNames
   202  	return Config
   203  }
   204  
   205  // ForceRead reads configuration from given file name or bails out if it fails
   206  func ForceRead(fileName string) *Configuration {
   207  	_, err := read(fileName)
   208  	if err != nil {
   209  		log.Fatal("Cannot read config file:", fileName, err)
   210  	}
   211  	readFileNames = []string{fileName}
   212  	return Config
   213  }
   214  
   215  // Reload re-reads configuration from last used files
   216  func Reload(extraFileNames ...string) *Configuration {
   217  	for _, fileName := range readFileNames {
   218  		_, _ = read(fileName)
   219  	}
   220  	for _, fileName := range extraFileNames {
   221  		_, _ = read(fileName)
   222  	}
   223  	return Config
   224  }
   225  
   226  // MarkConfigurationLoaded is called once configuration has first been loaded.
   227  // Listeners on ConfigurationLoaded will get a notification
   228  func MarkConfigurationLoaded() {
   229  	go func() {
   230  		for {
   231  			configurationLoaded <- true
   232  		}
   233  	}()
   234  	// wait for it
   235  	<-configurationLoaded
   236  }
   237  
   238  // WaitForConfigurationToBeLoaded does just that. It will return after
   239  // the configuration file has been read off disk.
   240  func WaitForConfigurationToBeLoaded() {
   241  	<-configurationLoaded
   242  }