vitess.io/vitess@v0.16.2/go/vt/vtorc/config/config.go (about) 1 /* 2 Copyright 2014 Outbrain Inc. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package config 18 19 import ( 20 "encoding/json" 21 "fmt" 22 "os" 23 "time" 24 25 "github.com/spf13/pflag" 26 27 "vitess.io/vitess/go/vt/log" 28 ) 29 30 const ( 31 LostInRecoveryDowntimeSeconds int = 60 * 60 * 24 * 365 32 ) 33 34 var configurationLoaded = make(chan bool) 35 36 const ( 37 HealthPollSeconds = 1 38 ActiveNodeExpireSeconds = 5 39 MaintenanceOwner = "vtorc" 40 AuditPageSize = 20 41 MaintenancePurgeDays = 7 42 MaintenanceExpireMinutes = 10 43 DebugMetricsIntervalSeconds = 10 44 StaleInstanceCoordinatesExpireSeconds = 60 45 DiscoveryMaxConcurrency = 300 // Number of goroutines doing hosts discovery 46 DiscoveryQueueCapacity = 100000 47 DiscoveryQueueMaxStatisticsSize = 120 48 DiscoveryCollectionRetentionSeconds = 120 49 HostnameResolveMethod = "default" 50 UnseenInstanceForgetHours = 240 // Number of hours after which an unseen instance is forgotten 51 ExpiryHostnameResolvesMinutes = 60 // Number of minutes after which to expire hostname-resolves 52 CandidateInstanceExpireMinutes = 60 // Minutes after which a suggestion to use an instance as a candidate replica (to be preferably promoted on primary failover) is expired. 53 FailureDetectionPeriodBlockMinutes = 60 // The time for which an instance's failure discovery is kept "active", so as to avoid concurrent "discoveries" of the instance's failure; this preceeds any recovery process, if any. 54 ) 55 56 var ( 57 sqliteDataFile = "file::memory:?mode=memory&cache=shared" 58 instancePollTime = 5 * time.Second 59 snapshotTopologyInterval = 0 * time.Hour 60 reasonableReplicationLag = 10 * time.Second 61 auditFileLocation = "" 62 auditToBackend = false 63 auditToSyslog = false 64 auditPurgeDuration = 7 * 24 * time.Hour // Equivalent of 7 days 65 recoveryPeriodBlockDuration = 30 * time.Second 66 preventCrossCellFailover = false 67 waitReplicasTimeout = 30 * time.Second 68 topoInformationRefreshDuration = 15 * time.Second 69 recoveryPollDuration = 1 * time.Second 70 ) 71 72 // RegisterFlags registers the flags required by VTOrc 73 func RegisterFlags(fs *pflag.FlagSet) { 74 fs.StringVar(&sqliteDataFile, "sqlite-data-file", sqliteDataFile, "SQLite Datafile to use as VTOrc's database") 75 fs.DurationVar(&instancePollTime, "instance-poll-time", instancePollTime, "Timer duration on which VTOrc refreshes MySQL information") 76 fs.DurationVar(&snapshotTopologyInterval, "snapshot-topology-interval", snapshotTopologyInterval, "Timer duration on which VTOrc takes a snapshot of the current MySQL information it has in the database. Should be in multiple of hours") 77 fs.DurationVar(&reasonableReplicationLag, "reasonable-replication-lag", reasonableReplicationLag, "Maximum replication lag on replicas which is deemed to be acceptable") 78 fs.StringVar(&auditFileLocation, "audit-file-location", auditFileLocation, "File location where the audit logs are to be stored") 79 fs.BoolVar(&auditToBackend, "audit-to-backend", auditToBackend, "Whether to store the audit log in the VTOrc database") 80 fs.BoolVar(&auditToSyslog, "audit-to-syslog", auditToSyslog, "Whether to store the audit log in the syslog") 81 fs.DurationVar(&auditPurgeDuration, "audit-purge-duration", auditPurgeDuration, "Duration for which audit logs are held before being purged. Should be in multiples of days") 82 fs.DurationVar(&recoveryPeriodBlockDuration, "recovery-period-block-duration", recoveryPeriodBlockDuration, "Duration for which a new recovery is blocked on an instance after running a recovery") 83 fs.BoolVar(&preventCrossCellFailover, "prevent-cross-cell-failover", preventCrossCellFailover, "Prevent VTOrc from promoting a primary in a different cell than the current primary in case of a failover") 84 fs.Duration("lock-shard-timeout", 30*time.Second, "Duration for which a shard lock is held when running a recovery") 85 _ = fs.MarkDeprecated("lock-shard-timeout", "Please use lock-timeout instead.") 86 fs.DurationVar(&waitReplicasTimeout, "wait-replicas-timeout", waitReplicasTimeout, "Duration for which to wait for replica's to respond when issuing RPCs") 87 fs.DurationVar(&topoInformationRefreshDuration, "topo-information-refresh-duration", topoInformationRefreshDuration, "Timer duration on which VTOrc refreshes the keyspace and vttablet records from the topology server") 88 fs.DurationVar(&recoveryPollDuration, "recovery-poll-duration", recoveryPollDuration, "Timer duration on which VTOrc polls its database to run a recovery") 89 } 90 91 // Configuration makes for vtorc configuration input, which can be provided by user via JSON formatted file. 92 // Some of the parameteres have reasonable default values, and some (like database credentials) are 93 // strictly expected from user. 94 // TODO(sougou): change this to yaml parsing, and possible merge with tabletenv. 95 type Configuration struct { 96 SQLite3DataFile string // full path to sqlite3 datafile 97 InstancePollSeconds uint // Number of seconds between instance reads 98 SnapshotTopologiesIntervalHours uint // Interval in hour between snapshot-topologies invocation. Default: 0 (disabled) 99 ReasonableReplicationLagSeconds int // Above this value is considered a problem 100 AuditLogFile string // Name of log file for audit operations. Disabled when empty. 101 AuditToSyslog bool // If true, audit messages are written to syslog 102 AuditToBackendDB bool // If true, audit messages are written to the backend DB's `audit` table (default: true) 103 AuditPurgeDays uint // Days after which audit entries are purged from the database 104 RecoveryPeriodBlockSeconds int // (overrides `RecoveryPeriodBlockMinutes`) The time for which an instance's recovery is kept "active", so as to avoid concurrent recoveries on smae instance as well as flapping 105 PreventCrossDataCenterPrimaryFailover bool // When true (default: false), cross-DC primary failover are not allowed, vtorc will do all it can to only fail over within same DC, or else not fail over at all. 106 WaitReplicasTimeoutSeconds int // Timeout on amount of time to wait for the replicas in case of ERS. Should be a small value because we should fail-fast. Should not be larger than LockTimeout since that is the total time we use for an ERS. 107 TopoInformationRefreshSeconds int // Timer duration on which VTOrc refreshes the keyspace and vttablet records from the topo-server. 108 RecoveryPollSeconds int // Timer duration on which VTOrc recovery analysis runs 109 } 110 111 // ToJSONString will marshal this configuration as JSON 112 func (config *Configuration) ToJSONString() string { 113 b, _ := json.Marshal(config) 114 return string(b) 115 } 116 117 // Config is *the* configuration instance, used globally to get configuration data 118 var Config = newConfiguration() 119 var readFileNames []string 120 121 // UpdateConfigValuesFromFlags is used to update the config values from the flags defined. 122 // This is done before we read any configuration files from the user. So the config files take precedence. 123 func UpdateConfigValuesFromFlags() { 124 Config.SQLite3DataFile = sqliteDataFile 125 Config.InstancePollSeconds = uint(instancePollTime / time.Second) 126 Config.InstancePollSeconds = uint(instancePollTime / time.Second) 127 Config.SnapshotTopologiesIntervalHours = uint(snapshotTopologyInterval / time.Hour) 128 Config.ReasonableReplicationLagSeconds = int(reasonableReplicationLag / time.Second) 129 Config.AuditLogFile = auditFileLocation 130 Config.AuditToBackendDB = auditToBackend 131 Config.AuditToSyslog = auditToSyslog 132 Config.AuditPurgeDays = uint(auditPurgeDuration / (time.Hour * 24)) 133 Config.RecoveryPeriodBlockSeconds = int(recoveryPeriodBlockDuration / time.Second) 134 Config.PreventCrossDataCenterPrimaryFailover = preventCrossCellFailover 135 Config.WaitReplicasTimeoutSeconds = int(waitReplicasTimeout / time.Second) 136 Config.TopoInformationRefreshSeconds = int(topoInformationRefreshDuration / time.Second) 137 Config.RecoveryPollSeconds = int(recoveryPollDuration / time.Second) 138 } 139 140 // LogConfigValues is used to log the config values. 141 func LogConfigValues() { 142 b, _ := json.MarshalIndent(Config, "", "\t") 143 log.Infof("Running with Configuration - %v", string(b)) 144 } 145 146 func newConfiguration() *Configuration { 147 return &Configuration{ 148 SQLite3DataFile: "file::memory:?mode=memory&cache=shared", 149 InstancePollSeconds: 5, 150 SnapshotTopologiesIntervalHours: 0, 151 ReasonableReplicationLagSeconds: 10, 152 AuditLogFile: "", 153 AuditToSyslog: false, 154 AuditToBackendDB: false, 155 AuditPurgeDays: 7, 156 RecoveryPeriodBlockSeconds: 30, 157 PreventCrossDataCenterPrimaryFailover: false, 158 WaitReplicasTimeoutSeconds: 30, 159 TopoInformationRefreshSeconds: 15, 160 RecoveryPollSeconds: 1, 161 } 162 } 163 164 func (config *Configuration) postReadAdjustments() error { 165 if config.SQLite3DataFile == "" { 166 return fmt.Errorf("SQLite3DataFile must be set") 167 } 168 169 return nil 170 } 171 172 // read reads configuration from given file, or silently skips if the file does not exist. 173 // If the file does exist, then it is expected to be in valid JSON format or the function bails out. 174 func read(fileName string) (*Configuration, error) { 175 if fileName == "" { 176 return Config, fmt.Errorf("Empty file name") 177 } 178 file, err := os.Open(fileName) 179 if err != nil { 180 return Config, err 181 } 182 decoder := json.NewDecoder(file) 183 err = decoder.Decode(Config) 184 if err == nil { 185 log.Infof("Read config: %s", fileName) 186 } else { 187 log.Fatal("Cannot read config file:", fileName, err) 188 } 189 if err := Config.postReadAdjustments(); err != nil { 190 log.Fatal(err) 191 } 192 return Config, err 193 } 194 195 // Read reads configuration from zero, either, some or all given files, in order of input. 196 // A file can override configuration provided in previous file. 197 func Read(fileNames ...string) *Configuration { 198 for _, fileName := range fileNames { 199 _, _ = read(fileName) 200 } 201 readFileNames = fileNames 202 return Config 203 } 204 205 // ForceRead reads configuration from given file name or bails out if it fails 206 func ForceRead(fileName string) *Configuration { 207 _, err := read(fileName) 208 if err != nil { 209 log.Fatal("Cannot read config file:", fileName, err) 210 } 211 readFileNames = []string{fileName} 212 return Config 213 } 214 215 // Reload re-reads configuration from last used files 216 func Reload(extraFileNames ...string) *Configuration { 217 for _, fileName := range readFileNames { 218 _, _ = read(fileName) 219 } 220 for _, fileName := range extraFileNames { 221 _, _ = read(fileName) 222 } 223 return Config 224 } 225 226 // MarkConfigurationLoaded is called once configuration has first been loaded. 227 // Listeners on ConfigurationLoaded will get a notification 228 func MarkConfigurationLoaded() { 229 go func() { 230 for { 231 configurationLoaded <- true 232 } 233 }() 234 // wait for it 235 <-configurationLoaded 236 } 237 238 // WaitForConfigurationToBeLoaded does just that. It will return after 239 // the configuration file has been read off disk. 240 func WaitForConfigurationToBeLoaded() { 241 <-configurationLoaded 242 }