vitess.io/vitess@v0.16.2/go/vt/throttler/replication_lag_cache.go (about)

     1  /*
     2  Copyright 2019 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package throttler
    18  
    19  import (
    20  	"sort"
    21  	"time"
    22  
    23  	"vitess.io/vitess/go/vt/discovery"
    24  )
    25  
// replicationLagCache caches for each replica a bounded list of historic
// replicationLagRecord entries.
type replicationLagCache struct {
	// entries maps from the replica to its history.
	// The map key is the string which add() derives from a record via
	// discovery.TabletToMapKey(r.Tablet).
	entries map[string]*replicationLagHistory

	// slowReplicas is a set of slow replicas.
	// It uses the same keys as "entries".
	// This map will always be recomputed by sortByLag() and must not be modified
	// from other methods. The constructor does not initialize it, i.e. it is nil
	// until sortByLag() has been called for the first time.
	slowReplicas map[string]bool

	// ignoredSlowReplicasInARow is a set of slow replicas for which the method
	// ignoreSlowReplica() has returned true.
	// It's used to detect the case where *all* replicas in a row have been
	// ignored. This happens when the lag on every replica increases and each
	// becomes the new slowest replica. This set is used to detect such a chain.
	// ignoreSlowReplica() replaces the set with an empty one when it returns
	// false while "slowReplicas" is non-empty.
	//
	// It uses the same keys as "entries".
	// If an entry is deleted from "entries", it must be deleted here as well.
	ignoredSlowReplicasInARow map[string]bool

	// historyCapacityPerReplica is the capacity passed to
	// newReplicationLagHistory() whenever add() starts tracking a new replica.
	historyCapacityPerReplica int
}
    52  
    53  func newReplicationLagCache(historyCapacityPerReplica int) *replicationLagCache {
    54  	return &replicationLagCache{
    55  		entries:                   make(map[string]*replicationLagHistory),
    56  		ignoredSlowReplicasInARow: make(map[string]bool),
    57  		historyCapacityPerReplica: historyCapacityPerReplica,
    58  	}
    59  }
    60  
    61  // add inserts or updates "r" in the cache for the replica with the key "r.Key".
    62  func (c *replicationLagCache) add(r replicationLagRecord) {
    63  	if !r.Serving {
    64  		// Tablet is down. Do no longer track it.
    65  		delete(c.entries, discovery.TabletToMapKey(r.Tablet))
    66  		delete(c.ignoredSlowReplicasInARow, discovery.TabletToMapKey(r.Tablet))
    67  		return
    68  	}
    69  
    70  	entry, ok := c.entries[discovery.TabletToMapKey(r.Tablet)]
    71  	if !ok {
    72  		entry = newReplicationLagHistory(c.historyCapacityPerReplica)
    73  		c.entries[discovery.TabletToMapKey(r.Tablet)] = entry
    74  	}
    75  
    76  	entry.add(r)
    77  }
    78  
    79  // latest returns the current lag record for the given LegacyTabletStats.Key string.
    80  // A zero record is returned if there is no latest entry.
    81  func (c *replicationLagCache) latest(key string) replicationLagRecord {
    82  	entry, ok := c.entries[key]
    83  	if !ok {
    84  		return replicationLagRecord{}
    85  	}
    86  	return entry.latest()
    87  }
    88  
    89  // atOrAfter returns the oldest replicationLagRecord which happened at "at"
    90  // or just after it.
    91  // If there is no such record, a zero record is returned.
    92  func (c *replicationLagCache) atOrAfter(key string, at time.Time) replicationLagRecord {
    93  	entry, ok := c.entries[key]
    94  	if !ok {
    95  		return replicationLagRecord{}
    96  	}
    97  	return entry.atOrAfter(at)
    98  }
    99  
   100  // sortByLag sorts all replicas by their latest replication lag value and
   101  // tablet uid and updates the c.slowReplicas set.
   102  func (c *replicationLagCache) sortByLag(ignoreNSlowestReplicas int, minimumReplicationLag int64) {
   103  	// Reset the current list of ignored replicas.
   104  	c.slowReplicas = make(map[string]bool)
   105  
   106  	if ignoreNSlowestReplicas >= len(c.entries) {
   107  		// Do not ignore slow replicas if all would get ignored.
   108  		return
   109  	}
   110  
   111  	// Turn the map of replicas into a list and then sort it.
   112  	var list byLagAndTabletUID
   113  	i := 0
   114  	for _, v := range c.entries {
   115  		record := v.latest()
   116  		if int64(record.Stats.ReplicationLagSeconds) >= minimumReplicationLag {
   117  			list = append(list, record.TabletHealth)
   118  			i++
   119  		}
   120  	}
   121  	sort.Sort(list)
   122  
   123  	// Now remember the N slowest replicas.
   124  	for i := len(list) - 1; len(list) > 0 && i >= len(list)-ignoreNSlowestReplicas; i-- {
   125  		c.slowReplicas[discovery.TabletToMapKey(list[i].Tablet)] = true
   126  	}
   127  }
   128  
   129  // byLagAndTabletUID is a slice of discovery.TabletHealth elements that
   130  // implements sort.Interface to sort by replication lag and tablet Uid.
   131  type byLagAndTabletUID []discovery.TabletHealth
   132  
   133  func (a byLagAndTabletUID) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
   134  func (a byLagAndTabletUID) Len() int      { return len(a) }
   135  func (a byLagAndTabletUID) Less(i, j int) bool {
   136  	return a[i].Stats.ReplicationLagSeconds < a[j].Stats.ReplicationLagSeconds ||
   137  		(a[i].Stats.ReplicationLagSeconds == a[j].Stats.ReplicationLagSeconds &&
   138  			a[i].Tablet.Alias.Uid < a[j].Tablet.Alias.Uid)
   139  }
   140  
   141  // ignoreSlowReplica returns true if the MaxReplicationLagModule should ignore
   142  // this slow replica.
   143  // "key" refers to ReplicationLagRecord.LegacyTabletStats.Key.
   144  func (c *replicationLagCache) ignoreSlowReplica(key string) bool {
   145  	if len(c.slowReplicas) == 0 {
   146  		// No slow replicas at all.
   147  		return false
   148  	}
   149  
   150  	slow := c.slowReplicas[key]
   151  	if slow {
   152  		// Record that we're ignoring this replica.
   153  		c.ignoredSlowReplicasInARow[key] = true
   154  
   155  		if len(c.ignoredSlowReplicasInARow) == len(c.entries) {
   156  			// All but this replica have been ignored in a row. Break this cycle now.
   157  			slow = false
   158  		}
   159  	}
   160  
   161  	if !slow {
   162  		// Replica is not slow.
   163  		if len(c.ignoredSlowReplicasInARow) != 0 {
   164  			// Reset the set of replicas which have been slow in a row so far.
   165  			c.ignoredSlowReplicasInARow = make(map[string]bool)
   166  		}
   167  	}
   168  	return slow
   169  }
   170  
   171  // isIgnored returns true if the given replica is a slow, ignored replica.
   172  // "key" refers to ReplicationLagRecord.LegacyTabletStats.Key.
   173  // Note: Unlike ignoreSlowReplica(key), this method does not update the count
   174  // how many replicas in a row have been ignored. Instead, it's meant to find out
   175  // when a replica is ignored and therefore the module should not wait for it.
   176  func (c *replicationLagCache) isIgnored(key string) bool {
   177  	return c.slowReplicas[key]
   178  }
   179  
// replicationLagHistory stores the most recent replicationLagRecord entries
// in a ring buffer for a single replica.
type replicationLagHistory struct {
	// records is the fixed-size backing storage of the ring buffer. Slots which
	// have not been written by add() yet hold zero records.
	records []replicationLagRecord
	// current has the index in "records" of the last element added by add().
	// It is -1 as long as add() has not been called.
	current int
}
   187  
   188  func newReplicationLagHistory(capacity int) *replicationLagHistory {
   189  	return &replicationLagHistory{
   190  		records: make([]replicationLagRecord, capacity),
   191  		current: -1,
   192  	}
   193  }
   194  
   195  func (h *replicationLagHistory) add(r replicationLagRecord) {
   196  	h.advanceCurrent()
   197  	h.records[h.current] = r
   198  }
   199  
   200  func (h *replicationLagHistory) latest() replicationLagRecord {
   201  	return h.records[h.current]
   202  }
   203  
   204  // atOrAfter returns the oldest replicationLagRecord which happened at "at"
   205  // or just after it.
   206  // If there is no such record, a zero record is returned.
   207  func (h *replicationLagHistory) atOrAfter(at time.Time) replicationLagRecord {
   208  	wrapped := false
   209  	i := h.current
   210  	for {
   211  		// Look at the previous (older) entry to decide if we should return the
   212  		// current entry.
   213  		prev := i - 1
   214  		if prev < 0 {
   215  			wrapped = true
   216  			prev = len(h.records) - 1
   217  		}
   218  
   219  		if h.records[prev].isZero() || h.records[prev].time.Before(at) {
   220  			// Return this entry because the previous one does not exist or
   221  			// it happened before the time we're interested in.
   222  			return h.records[i]
   223  		}
   224  		if wrapped && prev == h.current {
   225  			// We scanned the whole list and all entries match. Return the oldest.
   226  			return h.records[i]
   227  		}
   228  
   229  		i = prev
   230  	}
   231  }
   232  
   233  func (h *replicationLagHistory) advanceCurrent() {
   234  	h.current++
   235  	if h.current == len(h.records) {
   236  		h.current = 0
   237  	}
   238  }