vitess.io/vitess@v0.16.2/go/vt/throttler/replication_lag_cache.go (about) 1 /* 2 Copyright 2019 The Vitess Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package throttler 18 19 import ( 20 "sort" 21 "time" 22 23 "vitess.io/vitess/go/vt/discovery" 24 ) 25 26 // replicationLagCache caches for each replica a bounded list of historic 27 // replicationlagRecord entries. 28 type replicationLagCache struct { 29 // entries maps from the replica to its history. 30 // The map key is replicationLagRecord.LegacyTabletStats.Key. 31 entries map[string]*replicationLagHistory 32 33 // slowReplicas is a set of slow replicas. 34 // The map key is replicationLagRecord.LegacyTabletStats.Key. 35 // This map will always be recomputed by sortByLag() and must not be modified 36 // from other methods. 37 slowReplicas map[string]bool 38 39 // ignoredSlowReplicasInARow is a set of slow replicas for which the method 40 // ignoreSlowReplica() has returned true. 41 // It's used to detect the case where *all* replicas in a row have been 42 // ignored. This happens when the lag on every replica increases and each 43 // becomes the new slowest replica. This set is used to detect such a chain. 44 // The set will be cleared if ignoreSlowReplica() returns false. 45 // 46 // The map key is replicationLagRecord.LegacyTabletStats.Key. 47 // If an entry is deleted from "entries", it must be deleted here as well. 48 ignoredSlowReplicasInARow map[string]bool 49 50 historyCapacityPerReplica int 51 } 52 53 func newReplicationLagCache(historyCapacityPerReplica int) *replicationLagCache { 54 return &replicationLagCache{ 55 entries: make(map[string]*replicationLagHistory), 56 ignoredSlowReplicasInARow: make(map[string]bool), 57 historyCapacityPerReplica: historyCapacityPerReplica, 58 } 59 } 60 61 // add inserts or updates "r" in the cache for the replica with the key "r.Key". 62 func (c *replicationLagCache) add(r replicationLagRecord) { 63 if !r.Serving { 64 // Tablet is down. Do no longer track it. 65 delete(c.entries, discovery.TabletToMapKey(r.Tablet)) 66 delete(c.ignoredSlowReplicasInARow, discovery.TabletToMapKey(r.Tablet)) 67 return 68 } 69 70 entry, ok := c.entries[discovery.TabletToMapKey(r.Tablet)] 71 if !ok { 72 entry = newReplicationLagHistory(c.historyCapacityPerReplica) 73 c.entries[discovery.TabletToMapKey(r.Tablet)] = entry 74 } 75 76 entry.add(r) 77 } 78 79 // latest returns the current lag record for the given LegacyTabletStats.Key string. 80 // A zero record is returned if there is no latest entry. 81 func (c *replicationLagCache) latest(key string) replicationLagRecord { 82 entry, ok := c.entries[key] 83 if !ok { 84 return replicationLagRecord{} 85 } 86 return entry.latest() 87 } 88 89 // atOrAfter returns the oldest replicationLagRecord which happened at "at" 90 // or just after it. 91 // If there is no such record, a zero record is returned. 92 func (c *replicationLagCache) atOrAfter(key string, at time.Time) replicationLagRecord { 93 entry, ok := c.entries[key] 94 if !ok { 95 return replicationLagRecord{} 96 } 97 return entry.atOrAfter(at) 98 } 99 100 // sortByLag sorts all replicas by their latest replication lag value and 101 // tablet uid and updates the c.slowReplicas set. 102 func (c *replicationLagCache) sortByLag(ignoreNSlowestReplicas int, minimumReplicationLag int64) { 103 // Reset the current list of ignored replicas. 104 c.slowReplicas = make(map[string]bool) 105 106 if ignoreNSlowestReplicas >= len(c.entries) { 107 // Do not ignore slow replicas if all would get ignored. 108 return 109 } 110 111 // Turn the map of replicas into a list and then sort it. 112 var list byLagAndTabletUID 113 i := 0 114 for _, v := range c.entries { 115 record := v.latest() 116 if int64(record.Stats.ReplicationLagSeconds) >= minimumReplicationLag { 117 list = append(list, record.TabletHealth) 118 i++ 119 } 120 } 121 sort.Sort(list) 122 123 // Now remember the N slowest replicas. 124 for i := len(list) - 1; len(list) > 0 && i >= len(list)-ignoreNSlowestReplicas; i-- { 125 c.slowReplicas[discovery.TabletToMapKey(list[i].Tablet)] = true 126 } 127 } 128 129 // byLagAndTabletUID is a slice of discovery.TabletHealth elements that 130 // implements sort.Interface to sort by replication lag and tablet Uid. 131 type byLagAndTabletUID []discovery.TabletHealth 132 133 func (a byLagAndTabletUID) Swap(i, j int) { a[i], a[j] = a[j], a[i] } 134 func (a byLagAndTabletUID) Len() int { return len(a) } 135 func (a byLagAndTabletUID) Less(i, j int) bool { 136 return a[i].Stats.ReplicationLagSeconds < a[j].Stats.ReplicationLagSeconds || 137 (a[i].Stats.ReplicationLagSeconds == a[j].Stats.ReplicationLagSeconds && 138 a[i].Tablet.Alias.Uid < a[j].Tablet.Alias.Uid) 139 } 140 141 // ignoreSlowReplica returns true if the MaxReplicationLagModule should ignore 142 // this slow replica. 143 // "key" refers to ReplicationLagRecord.LegacyTabletStats.Key. 144 func (c *replicationLagCache) ignoreSlowReplica(key string) bool { 145 if len(c.slowReplicas) == 0 { 146 // No slow replicas at all. 147 return false 148 } 149 150 slow := c.slowReplicas[key] 151 if slow { 152 // Record that we're ignoring this replica. 153 c.ignoredSlowReplicasInARow[key] = true 154 155 if len(c.ignoredSlowReplicasInARow) == len(c.entries) { 156 // All but this replica have been ignored in a row. Break this cycle now. 157 slow = false 158 } 159 } 160 161 if !slow { 162 // Replica is not slow. 163 if len(c.ignoredSlowReplicasInARow) != 0 { 164 // Reset the set of replicas which have been slow in a row so far. 165 c.ignoredSlowReplicasInARow = make(map[string]bool) 166 } 167 } 168 return slow 169 } 170 171 // isIgnored returns true if the given replica is a slow, ignored replica. 172 // "key" refers to ReplicationLagRecord.LegacyTabletStats.Key. 173 // Note: Unlike ignoreSlowReplica(key), this method does not update the count 174 // how many replicas in a row have been ignored. Instead, it's meant to find out 175 // when a replica is ignored and therefore the module should not wait for it. 176 func (c *replicationLagCache) isIgnored(key string) bool { 177 return c.slowReplicas[key] 178 } 179 180 // replicationLagHistory stores the most recent replicationLagRecord entries 181 // in a ring buffer for a single replica. 182 type replicationLagHistory struct { 183 records []replicationLagRecord 184 // current has the index in "records" of the last element added by add(). 185 current int 186 } 187 188 func newReplicationLagHistory(capacity int) *replicationLagHistory { 189 return &replicationLagHistory{ 190 records: make([]replicationLagRecord, capacity), 191 current: -1, 192 } 193 } 194 195 func (h *replicationLagHistory) add(r replicationLagRecord) { 196 h.advanceCurrent() 197 h.records[h.current] = r 198 } 199 200 func (h *replicationLagHistory) latest() replicationLagRecord { 201 return h.records[h.current] 202 } 203 204 // atOrAfter returns the oldest replicationLagRecord which happened at "at" 205 // or just after it. 206 // If there is no such record, a zero record is returned. 207 func (h *replicationLagHistory) atOrAfter(at time.Time) replicationLagRecord { 208 wrapped := false 209 i := h.current 210 for { 211 // Look at the previous (older) entry to decide if we should return the 212 // current entry. 213 prev := i - 1 214 if prev < 0 { 215 wrapped = true 216 prev = len(h.records) - 1 217 } 218 219 if h.records[prev].isZero() || h.records[prev].time.Before(at) { 220 // Return this entry because the previous one does not exist or 221 // it happened before the time we're interested in. 222 return h.records[i] 223 } 224 if wrapped && prev == h.current { 225 // We scanned the whole list and all entries match. Return the oldest. 226 return h.records[i] 227 } 228 229 i = prev 230 } 231 } 232 233 func (h *replicationLagHistory) advanceCurrent() { 234 h.current++ 235 if h.current == len(h.records) { 236 h.current = 0 237 } 238 }