vitess.io/vitess@v0.16.2/go/vt/discovery/replicationlag.go (about) 1 /* 2 Copyright 2019 The Vitess Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package discovery 18 19 import ( 20 "fmt" 21 "sort" 22 "time" 23 24 "github.com/spf13/pflag" 25 26 "vitess.io/vitess/go/vt/servenv" 27 ) 28 29 var ( 30 // lowReplicationLag defines the duration that replication lag is low enough that the VTTablet is considered healthy. 31 lowReplicationLag time.Duration 32 highReplicationLagMinServing time.Duration 33 minNumTablets int 34 legacyReplicationLagAlgorithm bool 35 ) 36 37 func init() { 38 servenv.OnParseFor("vtgate", registerReplicationFlags) 39 } 40 41 func registerReplicationFlags(fs *pflag.FlagSet) { 42 fs.DurationVar(&lowReplicationLag, "discovery_low_replication_lag", 30*time.Second, "Threshold below which replication lag is considered low enough to be healthy.") 43 fs.DurationVar(&highReplicationLagMinServing, "discovery_high_replication_lag_minimum_serving", 2*time.Hour, "Threshold above which replication lag is considered too high when applying the min_number_serving_vttablets flag.") 44 fs.IntVar(&minNumTablets, "min_number_serving_vttablets", 2, "The minimum number of vttablets for each replicating tablet_type (e.g. replica, rdonly) that will be continue to be used even with replication lag above discovery_low_replication_lag, but still below discovery_high_replication_lag_minimum_serving.") 45 fs.BoolVar(&legacyReplicationLagAlgorithm, "legacy_replication_lag_algorithm", true, "Use the legacy algorithm when selecting vttablets for serving.") 46 } 47 48 // GetLowReplicationLag getter for use by debugenv 49 func GetLowReplicationLag() time.Duration { 50 return lowReplicationLag 51 } 52 53 // SetLowReplicationLag setter for use by debugenv 54 func SetLowReplicationLag(lag time.Duration) { 55 lowReplicationLag = lag 56 } 57 58 // GetHighReplicationLagMinServing getter for use by debugenv 59 func GetHighReplicationLagMinServing() time.Duration { 60 return highReplicationLagMinServing 61 } 62 63 // SetHighReplicationLagMinServing setter for use by debugenv 64 func SetHighReplicationLagMinServing(lag time.Duration) { 65 highReplicationLagMinServing = lag 66 } 67 68 // GetMinNumTablets getter for use by debugenv 69 func GetMinNumTablets() int { 70 return minNumTablets 71 } 72 73 // SetMinNumTablets setter for use by debugenv 74 func SetMinNumTablets(numTablets int) { 75 minNumTablets = numTablets 76 } 77 78 // IsReplicationLagHigh verifies that the given LegacytabletHealth refers to a tablet with high 79 // replication lag, i.e. higher than the configured discovery_low_replication_lag flag. 80 func IsReplicationLagHigh(tabletHealth *TabletHealth) bool { 81 return float64(tabletHealth.Stats.ReplicationLagSeconds) > lowReplicationLag.Seconds() 82 } 83 84 // IsReplicationLagVeryHigh verifies that the given LegacytabletHealth refers to a tablet with very high 85 // replication lag, i.e. higher than the configured discovery_high_replication_lag_minimum_serving flag. 86 func IsReplicationLagVeryHigh(tabletHealth *TabletHealth) bool { 87 return float64(tabletHealth.Stats.ReplicationLagSeconds) > highReplicationLagMinServing.Seconds() 88 } 89 90 // FilterStatsByReplicationLag filters the list of TabletHealth by TabletHealth.Stats.ReplicationLagSeconds. 91 // Note that TabletHealth that is non-serving or has error is ignored. 92 // 93 // The simplified logic: 94 // - Return tablets that have lag <= lowReplicationLag. 95 // - Make sure we return at least minNumTablets tablets, if there are enough one with lag <= highReplicationLagMinServing. 96 // For example, with the default of 30s / 2h / 2, this means: 97 // - lags of (5s, 10s, 15s, 120s) return the first three 98 // - lags of (30m, 35m, 40m, 45m) return the first two 99 // - lags of (2h, 3h, 4h, 5h) return the first one 100 // 101 // The legacy algorithm (default for now): 102 // - Return the list if there is 0 or 1 tablet. 103 // - Return the list if all tablets have <=30s lag. 104 // - Filter by replication lag: for each tablet, if the mean value without it is more than 0.7 of the mean value across all tablets, it is valid. 105 // - Make sure we return at least minNumTablets tablets (if there are enough one with only low replication lag). 106 // - If one tablet is removed, run above steps again in case there are two tablets with high replication lag. (It should cover most cases.) 107 // For example, lags of (5s, 10s, 15s, 120s) return the first three; 108 // lags of (30m, 35m, 40m, 45m) return all. 109 // 110 // One thing to know about this code: vttablet also has a couple flags that impact the logic here: 111 // - unhealthy_threshold: if replication lag is higher than this, a tablet will be reported as unhealthy. 112 // The default for this is 2h, same as the discovery_high_replication_lag_minimum_serving here. 113 // - degraded_threshold: this is only used by vttablet for display. It should match 114 // discovery_low_replication_lag here, so the vttablet status display matches what vtgate will do of it. 115 func FilterStatsByReplicationLag(tabletHealthList []*TabletHealth) []*TabletHealth { 116 if !legacyReplicationLagAlgorithm { 117 return filterStatsByLag(tabletHealthList) 118 } 119 res := filterStatsByLagWithLegacyAlgorithm(tabletHealthList) 120 // run the filter again if exactly one tablet is removed, 121 // and we have spare tablets. 122 if len(res) > minNumTablets && len(res) == len(tabletHealthList)-1 { 123 res = filterStatsByLagWithLegacyAlgorithm(res) 124 } 125 return res 126 127 } 128 129 func filterStatsByLag(tabletHealthList []*TabletHealth) []*TabletHealth { 130 list := make([]tabletLagSnapshot, 0, len(tabletHealthList)) 131 // filter non-serving tablets and those with very high replication lag 132 for _, ts := range tabletHealthList { 133 if !ts.Serving || ts.LastError != nil || ts.Stats == nil || IsReplicationLagVeryHigh(ts) { 134 continue 135 } 136 // Pull the current replication lag for a stable sort later. 137 list = append(list, tabletLagSnapshot{ 138 ts: ts, 139 replag: ts.Stats.ReplicationLagSeconds}) 140 } 141 142 // Sort by replication lag. 143 sort.Sort(tabletLagSnapshotList(list)) 144 145 // Pick those with low replication lag, but at least minNumTablets tablets regardless. 146 res := make([]*TabletHealth, 0, len(list)) 147 for i := 0; i < len(list); i++ { 148 if !IsReplicationLagHigh(list[i].ts) || i < minNumTablets { 149 res = append(res, list[i].ts) 150 } 151 } 152 return res 153 } 154 155 func filterStatsByLagWithLegacyAlgorithm(tabletHealthList []*TabletHealth) []*TabletHealth { 156 list := make([]*TabletHealth, 0, len(tabletHealthList)) 157 // filter non-serving tablets 158 for _, ts := range tabletHealthList { 159 if !ts.Serving || ts.LastError != nil || ts.Stats == nil { 160 continue 161 } 162 list = append(list, ts) 163 } 164 if len(list) <= 1 { 165 return list 166 } 167 // if all have low replication lag (<=30s), return all tablets. 168 allLowLag := true 169 for _, ts := range list { 170 if IsReplicationLagHigh(ts) { 171 allLowLag = false 172 break 173 } 174 } 175 if allLowLag { 176 return list 177 } 178 // filter those affecting "mean" lag significantly 179 // calculate mean for all tablets 180 res := make([]*TabletHealth, 0, len(list)) 181 m, _ := mean(list, -1) 182 for i, ts := range list { 183 // calculate mean by excluding ith tablet 184 mi, _ := mean(list, i) 185 if float64(mi) > float64(m)*0.7 { 186 res = append(res, ts) 187 } 188 } 189 if len(res) >= minNumTablets { 190 return res 191 } 192 // return at least minNumTablets tablets to avoid over loading, 193 // if there is enough tablets with replication lag < highReplicationLagMinServing. 194 // Pull the current replication lag for a stable sort. 195 snapshots := make([]tabletLagSnapshot, 0, len(list)) 196 for _, ts := range list { 197 if !IsReplicationLagVeryHigh(ts) { 198 snapshots = append(snapshots, tabletLagSnapshot{ 199 ts: ts, 200 replag: ts.Stats.ReplicationLagSeconds}) 201 } 202 } 203 if len(snapshots) == 0 { 204 // We get here if all tablets are over the high 205 // replication lag threshold, and their lag is 206 // different enough that the 70% mean computation up 207 // there didn't find them all in a group. For 208 // instance, if *minNumTablets = 2, and we have two 209 // tablets with lag of 3h and 30h. In that case, we 210 // just use them all. 211 for _, ts := range list { 212 snapshots = append(snapshots, tabletLagSnapshot{ 213 ts: ts, 214 replag: ts.Stats.ReplicationLagSeconds}) 215 } 216 } 217 218 // Sort by replication lag. 219 sort.Sort(byReplag(snapshots)) 220 221 // Pick the first minNumTablets tablets. 222 res = make([]*TabletHealth, 0, minNumTablets) 223 for i := 0; i < min(minNumTablets, len(snapshots)); i++ { 224 res = append(res, snapshots[i].ts) 225 } 226 return res 227 } 228 229 type byReplag []tabletLagSnapshot 230 231 func (a byReplag) Len() int { return len(a) } 232 func (a byReplag) Swap(i, j int) { a[i], a[j] = a[j], a[i] } 233 func (a byReplag) Less(i, j int) bool { return a[i].replag < a[j].replag } 234 235 type tabletLagSnapshot struct { 236 ts *TabletHealth 237 replag uint32 238 } 239 type tabletLagSnapshotList []tabletLagSnapshot 240 241 func (a tabletLagSnapshotList) Len() int { return len(a) } 242 func (a tabletLagSnapshotList) Swap(i, j int) { a[i], a[j] = a[j], a[i] } 243 func (a tabletLagSnapshotList) Less(i, j int) bool { return a[i].replag < a[j].replag } 244 245 func min(a, b int) int { 246 if a > b { 247 return b 248 } 249 return a 250 } 251 252 // mean calculates the mean value over the given list, 253 // while excluding the item with the specified index. 254 func mean(tabletHealthList []*TabletHealth, idxExclude int) (uint64, error) { 255 var sum uint64 256 var count uint64 257 for i, ts := range tabletHealthList { 258 if i == idxExclude { 259 continue 260 } 261 sum = sum + uint64(ts.Stats.ReplicationLagSeconds) 262 count++ 263 } 264 if count == 0 { 265 return 0, fmt.Errorf("empty list") 266 } 267 return sum / count, nil 268 }