vitess.io/vitess@v0.16.2/go/vt/discovery/topology_watcher.go (about) 1 /* 2 Copyright 2019 The Vitess Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package discovery 18 19 import ( 20 "bytes" 21 "fmt" 22 "hash/crc32" 23 "sort" 24 "strings" 25 "sync" 26 "time" 27 28 "vitess.io/vitess/go/vt/topo/topoproto" 29 30 "vitess.io/vitess/go/vt/key" 31 32 "context" 33 34 "vitess.io/vitess/go/stats" 35 "vitess.io/vitess/go/trace" 36 37 "vitess.io/vitess/go/vt/log" 38 "vitess.io/vitess/go/vt/proto/topodata" 39 "vitess.io/vitess/go/vt/topo" 40 ) 41 42 const ( 43 topologyWatcherOpListTablets = "ListTablets" 44 topologyWatcherOpGetTablet = "GetTablet" 45 topologyWatcherOpAddTablet = "AddTablet" 46 topologyWatcherOpRemoveTablet = "RemoveTablet" 47 topologyWatcherOpReplaceTablet = "ReplaceTablet" 48 ) 49 50 var ( 51 topologyWatcherOperations = stats.NewCountersWithSingleLabel("TopologyWatcherOperations", "Topology watcher operation counts", 52 "Operation", topologyWatcherOpListTablets, topologyWatcherOpGetTablet, topologyWatcherOpAddTablet, topologyWatcherOpRemoveTablet, topologyWatcherOpReplaceTablet) 53 topologyWatcherErrors = stats.NewCountersWithSingleLabel("TopologyWatcherErrors", "Topology watcher error counts", 54 "Operation", topologyWatcherOpListTablets, topologyWatcherOpGetTablet) 55 ) 56 57 // tabletInfo is used internally by the TopologyWatcher class 58 type tabletInfo struct { 59 alias string 60 tablet *topodata.Tablet 61 } 62 63 // TopologyWatcher polls tablet from a configurable set of tablets 64 // periodically. When tablets are added / removed, it calls 65 // the LegacyTabletRecorder AddTablet / RemoveTablet interface appropriately. 66 type TopologyWatcher struct { 67 // set at construction time 68 topoServer *topo.Server 69 healthcheck HealthCheck 70 tabletFilter TabletFilter 71 cell string 72 refreshInterval time.Duration 73 refreshKnownTablets bool 74 getTablets func(tw *TopologyWatcher) ([]*topodata.TabletAlias, error) 75 sem chan int 76 ctx context.Context 77 cancelFunc context.CancelFunc 78 // wg keeps track of all launched Go routines. 79 wg sync.WaitGroup 80 81 // mu protects all variables below 82 mu sync.Mutex 83 // tablets contains a map of alias -> tabletInfo for all known tablets 84 tablets map[string]*tabletInfo 85 // topoChecksum stores a crc32 of the tablets map and is exported as a metric 86 topoChecksum uint32 87 // lastRefresh records the timestamp of the last topo refresh 88 lastRefresh time.Time 89 // firstLoadDone is true when first load of the topology data is done. 90 firstLoadDone bool 91 // firstLoadChan is closed when the initial loading of topology data is done. 92 firstLoadChan chan struct{} 93 } 94 95 // NewTopologyWatcher returns a TopologyWatcher that monitors all 96 // the tablets in a cell, and starts refreshing. 97 func NewTopologyWatcher(ctx context.Context, topoServer *topo.Server, hc HealthCheck, filter TabletFilter, cell string, refreshInterval time.Duration, refreshKnownTablets bool, topoReadConcurrency int, getTablets func(tw *TopologyWatcher) ([]*topodata.TabletAlias, error)) *TopologyWatcher { 98 tw := &TopologyWatcher{ 99 topoServer: topoServer, 100 healthcheck: hc, 101 tabletFilter: filter, 102 cell: cell, 103 refreshInterval: refreshInterval, 104 refreshKnownTablets: refreshKnownTablets, 105 getTablets: getTablets, 106 sem: make(chan int, topoReadConcurrency), 107 tablets: make(map[string]*tabletInfo), 108 } 109 tw.firstLoadChan = make(chan struct{}) 110 111 // We want the span from the context, but not the cancelation that comes with it 112 spanContext := trace.CopySpan(context.Background(), ctx) 113 tw.ctx, tw.cancelFunc = context.WithCancel(spanContext) 114 return tw 115 } 116 117 // NewCellTabletsWatcher returns a TopologyWatcher that monitors all 118 // the tablets in a cell, and starts refreshing. 119 func NewCellTabletsWatcher(ctx context.Context, topoServer *topo.Server, hc HealthCheck, f TabletFilter, cell string, refreshInterval time.Duration, refreshKnownTablets bool, topoReadConcurrency int) *TopologyWatcher { 120 return NewTopologyWatcher(ctx, topoServer, hc, f, cell, refreshInterval, refreshKnownTablets, topoReadConcurrency, func(tw *TopologyWatcher) ([]*topodata.TabletAlias, error) { 121 return tw.topoServer.GetTabletAliasesByCell(ctx, tw.cell) 122 }) 123 } 124 125 // Start starts the topology watcher 126 func (tw *TopologyWatcher) Start() { 127 tw.wg.Add(1) 128 go func(t *TopologyWatcher) { 129 defer t.wg.Done() 130 ticker := time.NewTicker(t.refreshInterval) 131 defer ticker.Stop() 132 for { 133 t.loadTablets() 134 select { 135 case <-t.ctx.Done(): 136 return 137 case <-ticker.C: 138 } 139 } 140 }(tw) 141 } 142 143 // Stop stops the watcher. It does not clean up the tablets added to LegacyTabletRecorder. 144 func (tw *TopologyWatcher) Stop() { 145 tw.cancelFunc() 146 // wait for watch goroutine to finish. 147 tw.wg.Wait() 148 } 149 150 func (tw *TopologyWatcher) loadTablets() { 151 var wg sync.WaitGroup 152 newTablets := make(map[string]*tabletInfo) 153 154 // first get the list of relevant tabletAliases 155 tabletAliases, err := tw.getTablets(tw) 156 topologyWatcherOperations.Add(topologyWatcherOpListTablets, 1) 157 if err != nil { 158 topologyWatcherErrors.Add(topologyWatcherOpListTablets, 1) 159 select { 160 case <-tw.ctx.Done(): 161 return 162 default: 163 } 164 log.Errorf("cannot get tablets for cell: %v: %v", tw.cell, err) 165 return 166 } 167 168 // Accumulate a list of all known alias strings to use later 169 // when sorting 170 tabletAliasStrs := make([]string, 0, len(tabletAliases)) 171 172 tw.mu.Lock() 173 for _, tAlias := range tabletAliases { 174 aliasStr := topoproto.TabletAliasString(tAlias) 175 tabletAliasStrs = append(tabletAliasStrs, aliasStr) 176 177 if !tw.refreshKnownTablets { 178 // we already have a tabletInfo for this and the flag tells us to not refresh 179 if val, ok := tw.tablets[aliasStr]; ok { 180 newTablets[aliasStr] = val 181 continue 182 } 183 } 184 185 wg.Add(1) 186 go func(alias *topodata.TabletAlias) { 187 defer wg.Done() 188 tw.sem <- 1 // Wait for active queue to drain. 189 tablet, err := tw.topoServer.GetTablet(tw.ctx, alias) 190 topologyWatcherOperations.Add(topologyWatcherOpGetTablet, 1) 191 <-tw.sem // Done; enable next request to run 192 if err != nil { 193 topologyWatcherErrors.Add(topologyWatcherOpGetTablet, 1) 194 select { 195 case <-tw.ctx.Done(): 196 return 197 default: 198 } 199 log.Errorf("cannot get tablet for alias %v: %v", alias, err) 200 return 201 } 202 tw.mu.Lock() 203 aliasStr := topoproto.TabletAliasString(alias) 204 newTablets[aliasStr] = &tabletInfo{ 205 alias: aliasStr, 206 tablet: tablet.Tablet, 207 } 208 tw.mu.Unlock() 209 }(tAlias) 210 } 211 212 tw.mu.Unlock() 213 wg.Wait() 214 tw.mu.Lock() 215 216 for alias, newVal := range newTablets { 217 if tw.tabletFilter != nil && !tw.tabletFilter.IsIncluded(newVal.tablet) { 218 continue 219 } 220 221 // trust the alias from topo and add it if it doesn't exist 222 if val, ok := tw.tablets[alias]; ok { 223 // check if the host and port have changed. If yes, replace tablet. 224 oldKey := TabletToMapKey(val.tablet) 225 newKey := TabletToMapKey(newVal.tablet) 226 if oldKey != newKey { 227 // This is the case where the same tablet alias is now reporting 228 // a different address (host:port) key. 229 tw.healthcheck.ReplaceTablet(val.tablet, newVal.tablet) 230 topologyWatcherOperations.Add(topologyWatcherOpReplaceTablet, 1) 231 } 232 } else { 233 // This is a new tablet record, let's add it to the healthcheck 234 tw.healthcheck.AddTablet(newVal.tablet) 235 topologyWatcherOperations.Add(topologyWatcherOpAddTablet, 1) 236 } 237 } 238 239 for _, val := range tw.tablets { 240 if tw.tabletFilter != nil && !tw.tabletFilter.IsIncluded(val.tablet) { 241 continue 242 } 243 244 if _, ok := newTablets[val.alias]; !ok { 245 tw.healthcheck.RemoveTablet(val.tablet) 246 topologyWatcherOperations.Add(topologyWatcherOpRemoveTablet, 1) 247 } 248 } 249 tw.tablets = newTablets 250 if !tw.firstLoadDone { 251 tw.firstLoadDone = true 252 close(tw.firstLoadChan) 253 } 254 255 // iterate through the tablets in a stable order and compute a 256 // checksum of the tablet map 257 sort.Strings(tabletAliasStrs) 258 var buf bytes.Buffer 259 for _, alias := range tabletAliasStrs { 260 _, ok := tw.tablets[alias] 261 if ok { 262 buf.WriteString(alias) 263 } 264 } 265 tw.topoChecksum = crc32.ChecksumIEEE(buf.Bytes()) 266 tw.lastRefresh = time.Now() 267 268 tw.mu.Unlock() 269 270 } 271 272 // RefreshLag returns the time since the last refresh 273 func (tw *TopologyWatcher) RefreshLag() time.Duration { 274 tw.mu.Lock() 275 defer tw.mu.Unlock() 276 277 return time.Since(tw.lastRefresh) 278 } 279 280 // TopoChecksum returns the checksum of the current state of the topo 281 func (tw *TopologyWatcher) TopoChecksum() uint32 { 282 tw.mu.Lock() 283 defer tw.mu.Unlock() 284 285 return tw.topoChecksum 286 } 287 288 // TabletFilter is an interface that can be given to a TopologyWatcher 289 // to be applied as an additional filter on the list of tablets returned by its getTablets function 290 type TabletFilter interface { 291 // IsIncluded returns whether tablet is included in this filter 292 IsIncluded(tablet *topodata.Tablet) bool 293 } 294 295 // FilterByShard is a filter that filters tablets by 296 // keyspace/shard. 297 type FilterByShard struct { 298 // filters is a map of keyspace to filters for shards 299 filters map[string][]*filterShard 300 } 301 302 // filterShard describes a filter for a given shard or keyrange inside 303 // a keyspace 304 type filterShard struct { 305 keyspace string 306 shard string 307 keyRange *topodata.KeyRange // only set if shard is also a KeyRange 308 } 309 310 // NewFilterByShard creates a new FilterByShard on top of an existing 311 // LegacyTabletRecorder. Each filter is a keyspace|shard entry, where shard 312 // can either be a shard name, or a keyrange. All tablets that match 313 // at least one keyspace|shard tuple will be forwarded to the 314 // underlying LegacyTabletRecorder. 315 func NewFilterByShard(filters []string) (*FilterByShard, error) { 316 m := make(map[string][]*filterShard) 317 for _, filter := range filters { 318 parts := strings.Split(filter, "|") 319 if len(parts) != 2 { 320 return nil, fmt.Errorf("invalid FilterByShard parameter: %v", filter) 321 } 322 323 keyspace := parts[0] 324 shard := parts[1] 325 326 // extract keyrange if it's a range 327 canonical, kr, err := topo.ValidateShardName(shard) 328 if err != nil { 329 return nil, fmt.Errorf("error parsing shard name %v: %v", shard, err) 330 } 331 332 // check for duplicates 333 for _, c := range m[keyspace] { 334 if c.shard == canonical { 335 return nil, fmt.Errorf("duplicate %v/%v entry", keyspace, shard) 336 } 337 } 338 339 m[keyspace] = append(m[keyspace], &filterShard{ 340 keyspace: keyspace, 341 shard: canonical, 342 keyRange: kr, 343 }) 344 } 345 346 return &FilterByShard{ 347 filters: m, 348 }, nil 349 } 350 351 // IsIncluded returns true iff the tablet's keyspace and shard should be 352 // forwarded to the underlying LegacyTabletRecorder. 353 func (fbs *FilterByShard) IsIncluded(tablet *topodata.Tablet) bool { 354 canonical, kr, err := topo.ValidateShardName(tablet.Shard) 355 if err != nil { 356 log.Errorf("Error parsing shard name %v, will ignore tablet: %v", tablet.Shard, err) 357 return false 358 } 359 360 for _, c := range fbs.filters[tablet.Keyspace] { 361 if canonical == c.shard { 362 // Exact match (probably a non-sharded keyspace). 363 return true 364 } 365 if kr != nil && c.keyRange != nil && key.KeyRangeIncludes(c.keyRange, kr) { 366 // Our filter's KeyRange includes the provided KeyRange 367 return true 368 } 369 } 370 return false 371 } 372 373 // FilterByKeyspace is a filter that filters tablets by 374 // keyspace 375 type FilterByKeyspace struct { 376 keyspaces map[string]bool 377 } 378 379 // NewFilterByKeyspace creates a new FilterByKeyspace. 380 // Each filter is a keyspace entry. All tablets that match 381 // a keyspace will be forwarded to the underlying LegacyTabletRecorder. 382 func NewFilterByKeyspace(selectedKeyspaces []string) *FilterByKeyspace { 383 m := make(map[string]bool) 384 for _, keyspace := range selectedKeyspaces { 385 m[keyspace] = true 386 } 387 388 return &FilterByKeyspace{ 389 keyspaces: m, 390 } 391 } 392 393 // IsIncluded returns true if the tablet's keyspace should be 394 // forwarded to the underlying LegacyTabletRecorder. 395 func (fbk *FilterByKeyspace) IsIncluded(tablet *topodata.Tablet) bool { 396 _, exist := fbk.keyspaces[tablet.Keyspace] 397 return exist 398 }