// github.imxd.top/hashicorp/consul@v1.4.5/agent/router/router.go

package router

import (
	"fmt"
	"log"
	"sort"
	"sync"

	"github.com/hashicorp/consul/agent/metadata"
	"github.com/hashicorp/consul/agent/structs"
	"github.com/hashicorp/consul/lib"
	"github.com/hashicorp/consul/types"
	"github.com/hashicorp/serf/coordinate"
	"github.com/hashicorp/serf/serf"
)

// Router keeps track of a set of network areas and their associated Serf
// membership of Consul servers. It then indexes this by datacenter to provide
// healthy routes to servers in each datacenter.
type Router struct {
	// logger is used for diagnostic output.
	logger *log.Logger

	// localDatacenter has the name of the router's home datacenter. This is
	// used to short-circuit RTT calculations for local servers.
	localDatacenter string

	// areas maps area IDs to structures holding information about that
	// area.
	areas map[types.AreaID]*areaInfo

	// managers is an index from datacenter names to a list of server
	// managers for that datacenter. This is used to quickly look up routes.
	managers map[string][]*Manager

	// routeFn is a hook to actually do the routing.
	routeFn func(datacenter string) (*Manager, *metadata.Server, bool)

	// isShutdown prevents adding new routes to a router after it is shut
	// down.
	isShutdown bool

	// This top-level lock covers all the internal state.
	sync.RWMutex
}

// RouterSerfCluster is an interface wrapper around Serf in order to make this
// easier to unit test.
type RouterSerfCluster interface {
	NumNodes() int
	Members() []serf.Member
	GetCoordinate() (*coordinate.Coordinate, error)
	GetCachedCoordinate(name string) (*coordinate.Coordinate, bool)
}

// managerInfo holds a server manager for a datacenter along with its
// associated shutdown channel.
type managerInfo struct {
	// manager is notified about servers for this datacenter.
	manager *Manager

	// shutdownCh is only given to this manager so we can shut it down when
	// all servers for this datacenter are gone.
	shutdownCh chan struct{}
}

// areaInfo holds information about a given network area.
type areaInfo struct {
	// cluster is the Serf instance for this network area.
	cluster RouterSerfCluster

	// pinger is used to ping servers in this network area when trying to
	// find a new, healthy server to talk to.
	pinger Pinger

	// managers maps datacenter names to managers for that datacenter in
	// this area.
	managers map[string]*managerInfo

	// useTLS specifies whether to use TLS for communication in this
	// network area.
	useTLS bool
}

// NewRouter returns a new Router with the given configuration.
func NewRouter(logger *log.Logger, localDatacenter string) *Router {
	router := &Router{
		logger:          logger,
		localDatacenter: localDatacenter,
		areas:           make(map[types.AreaID]*areaInfo),
		managers:        make(map[string][]*Manager),
	}

	// Hook the direct route lookup by default.
	router.routeFn = router.findDirectRoute

	return router
}
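
// exampleNewRouter is a minimal usage sketch, assuming a home datacenter
// named "dc1"; it is illustrative only and not used by the package. Areas
// are registered separately via AddArea, and routes are resolved with
// FindRoute.
func exampleNewRouter(logger *log.Logger) *Router {
	r := NewRouter(logger, "dc1")
	// When the agent stops, Shutdown stops all managers and refuses new
	// areas:
	//	defer r.Shutdown()
	return r
}
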
// Shutdown removes all areas from the router, which stops all their
// respective managers. No new areas can be added after the router is shut
// down.
func (r *Router) Shutdown() {
	r.Lock()
	defer r.Unlock()

	for areaID, area := range r.areas {
		for datacenter, info := range area.managers {
			r.removeManagerFromIndex(datacenter, info.manager)
			close(info.shutdownCh)
		}

		delete(r.areas, areaID)
	}

	r.isShutdown = true
}

// AddArea registers a new network area with the router.
func (r *Router) AddArea(areaID types.AreaID, cluster RouterSerfCluster, pinger Pinger, useTLS bool) error {
	r.Lock()
	defer r.Unlock()

	if r.isShutdown {
		return fmt.Errorf("cannot add area, router is shut down")
	}

	if _, ok := r.areas[areaID]; ok {
		return fmt.Errorf("area ID %q already exists", areaID)
	}

	area := &areaInfo{
		cluster:  cluster,
		pinger:   pinger,
		managers: make(map[string]*managerInfo),
		useTLS:   useTLS,
	}
	r.areas[areaID] = area

	// Do an initial population of the managers so that we don't have to
	// wait for events to fire. This lets us attempt to use all the known
	// servers right away, and quickly detect any we can't reach as failed.
	for _, m := range cluster.Members() {
		ok, parts := metadata.IsConsulServer(m)
		if !ok {
			r.logger.Printf("[WARN] consul: Non-server %q in server-only area %q",
				m.Name, areaID)
			continue
		}

		if err := r.addServer(area, parts); err != nil {
			return fmt.Errorf("failed to add server %q to area %q: %v", m.Name, areaID, err)
		}
	}

	return nil
}

// removeManagerFromIndex does cleanup to take a manager out of the index of
// datacenters. This assumes the lock is already held for writing, and will
// panic if the given manager isn't found.
func (r *Router) removeManagerFromIndex(datacenter string, manager *Manager) {
	managers := r.managers[datacenter]
	for i := 0; i < len(managers); i++ {
		if managers[i] == manager {
			r.managers[datacenter] = append(managers[:i], managers[i+1:]...)
			if len(r.managers[datacenter]) == 0 {
				delete(r.managers, datacenter)
			}
			return
		}
	}
	panic("managers index out of sync")
}

// TLSEnabled returns whether TLS is enabled for the given area ID.
func (r *Router) TLSEnabled(areaID types.AreaID) (bool, error) {
	r.RLock()
	defer r.RUnlock()

	area, ok := r.areas[areaID]
	if !ok {
		return false, fmt.Errorf("area ID %q does not exist", areaID)
	}

	return area.useTLS, nil
}

// RemoveArea removes an existing network area from the router.
func (r *Router) RemoveArea(areaID types.AreaID) error {
	r.Lock()
	defer r.Unlock()

	area, ok := r.areas[areaID]
	if !ok {
		return fmt.Errorf("area ID %q does not exist", areaID)
	}

	// Remove all of this area's managers from the index and shut them down.
	for datacenter, info := range area.managers {
		r.removeManagerFromIndex(datacenter, info.manager)
		close(info.shutdownCh)
	}

	delete(r.areas, areaID)
	return nil
}
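
// mockRouterSerfCluster is a minimal sketch of a RouterSerfCluster stub for
// unit tests, assuming a static member list and a fresh origin coordinate.
// The type and its field are hypothetical and illustrative only; they are
// not used by the package.
type mockRouterSerfCluster struct {
	members []serf.Member
}

func (m *mockRouterSerfCluster) NumNodes() int          { return len(m.members) }
func (m *mockRouterSerfCluster) Members() []serf.Member { return m.members }

func (m *mockRouterSerfCluster) GetCoordinate() (*coordinate.Coordinate, error) {
	// A new coordinate at the origin stands in for this node's position.
	return coordinate.NewCoordinate(coordinate.DefaultConfig()), nil
}

func (m *mockRouterSerfCluster) GetCachedCoordinate(name string) (*coordinate.Coordinate, bool) {
	// No cached coordinates; ComputeDistance treats a nil coordinate as
	// positive-infinity RTT.
	return nil, false
}
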
// addServer does the work of AddServer once the write lock is held.
func (r *Router) addServer(area *areaInfo, s *metadata.Server) error {
	// Make the manager on the fly if this is the first we've seen of it,
	// and add it to the index.
	info, ok := area.managers[s.Datacenter]
	if !ok {
		shutdownCh := make(chan struct{})
		manager := New(r.logger, shutdownCh, area.cluster, area.pinger)
		info = &managerInfo{
			manager:    manager,
			shutdownCh: shutdownCh,
		}
		area.managers[s.Datacenter] = info

		managers := r.managers[s.Datacenter]
		r.managers[s.Datacenter] = append(managers, manager)
		go manager.Start()
	}

	// If TLS is enabled for the area, set it on the server so the manager
	// knows to use TLS when pinging it.
	if area.useTLS {
		s.UseTLS = true
	}

	info.manager.AddServer(s)
	return nil
}

// AddServer should be called whenever a new server joins an area. This is
// typically hooked into the Serf event handler for this area.
func (r *Router) AddServer(areaID types.AreaID, s *metadata.Server) error {
	r.Lock()
	defer r.Unlock()

	area, ok := r.areas[areaID]
	if !ok {
		return fmt.Errorf("area ID %q does not exist", areaID)
	}
	return r.addServer(area, s)
}

// RemoveServer should be called whenever a server is removed from an area.
// This is typically hooked into the Serf event handler for this area.
func (r *Router) RemoveServer(areaID types.AreaID, s *metadata.Server) error {
	r.Lock()
	defer r.Unlock()

	area, ok := r.areas[areaID]
	if !ok {
		return fmt.Errorf("area ID %q does not exist", areaID)
	}

	// If the manager has already been removed we just quietly exit. This
	// can get called by Serf events, so the timing isn't totally
	// deterministic.
	info, ok := area.managers[s.Datacenter]
	if !ok {
		return nil
	}
	info.manager.RemoveServer(s)

	// If this manager is empty then remove it so we don't accumulate cruft
	// and waste time during request routing.
	if num := info.manager.NumServers(); num == 0 {
		r.removeManagerFromIndex(s.Datacenter, info.manager)
		close(info.shutdownCh)
		delete(area.managers, s.Datacenter)
	}

	return nil
}

// FailServer should be called whenever a server is failed in an area. This
// is typically hooked into the Serf event handler for this area. We will
// immediately shift traffic away from this server, but it will remain in the
// list of servers.
func (r *Router) FailServer(areaID types.AreaID, s *metadata.Server) error {
	r.RLock()
	defer r.RUnlock()

	area, ok := r.areas[areaID]
	if !ok {
		return fmt.Errorf("area ID %q does not exist", areaID)
	}

	// If the manager has already been removed we just quietly exit. This
	// can get called by Serf events, so the timing isn't totally
	// deterministic.
	info, ok := area.managers[s.Datacenter]
	if !ok {
		return nil
	}

	info.manager.NotifyFailedServer(s)
	return nil
}
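
// exampleHandleSerfEvents is a minimal sketch of wiring Serf member events
// into AddServer, FailServer, and RemoveServer, assuming the caller supplies
// the Serf event channel and an area ID. Illustrative only and not used by
// the package; errors are ignored for brevity.
func exampleHandleSerfEvents(r *Router, areaID types.AreaID, eventCh <-chan serf.Event) {
	for event := range eventCh {
		me, ok := event.(serf.MemberEvent)
		if !ok {
			continue
		}
		for _, m := range me.Members {
			ok, server := metadata.IsConsulServer(m)
			if !ok {
				continue
			}
			switch me.EventType() {
			case serf.EventMemberJoin:
				r.AddServer(areaID, server)
			case serf.EventMemberFailed:
				r.FailServer(areaID, server)
			case serf.EventMemberLeave, serf.EventMemberReap:
				r.RemoveServer(areaID, server)
			}
		}
	}
}
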
// FindRoute returns a healthy server with a route to the given datacenter.
// The boolean return value indicates whether a server was available. In some
// cases this may return a best-effort unhealthy server that can be used for a
// connection attempt. If any problem occurs with the given server, the caller
// should feed that back to the manager associated with the server, which is
// also returned, by calling NotifyFailedServer().
func (r *Router) FindRoute(datacenter string) (*Manager, *metadata.Server, bool) {
	return r.routeFn(datacenter)
}

// findDirectRoute looks for a route to the given datacenter if it's directly
// adjacent to the server.
func (r *Router) findDirectRoute(datacenter string) (*Manager, *metadata.Server, bool) {
	r.RLock()
	defer r.RUnlock()

	// Get the list of managers for this datacenter. This will usually just
	// have one entry, but it's possible to have a user-defined area + WAN.
	managers, ok := r.managers[datacenter]
	if !ok {
		return nil, nil, false
	}

	// Try each manager until we get a server.
	for _, manager := range managers {
		if manager.IsOffline() {
			continue
		}

		if s := manager.FindServer(); s != nil {
			return manager, s, true
		}
	}

	// Didn't find a route (even via an unhealthy server).
	return nil, nil, false
}

// GetDatacenters returns a list of datacenters known to the router, sorted by
// name.
func (r *Router) GetDatacenters() []string {
	r.RLock()
	defer r.RUnlock()

	dcs := make([]string, 0, len(r.managers))
	for dc := range r.managers {
		dcs = append(dcs, dc)
	}

	sort.Strings(dcs)
	return dcs
}

// datacenterSorter takes a list of DC names and a parallel vector of
// distances and implements sort.Interface, keeping both structures coherent
// and sorting by distance.
type datacenterSorter struct {
	Names []string
	Vec   []float64
}

// See sort.Interface.
func (n *datacenterSorter) Len() int {
	return len(n.Names)
}

// See sort.Interface.
func (n *datacenterSorter) Swap(i, j int) {
	n.Names[i], n.Names[j] = n.Names[j], n.Names[i]
	n.Vec[i], n.Vec[j] = n.Vec[j], n.Vec[i]
}

// See sort.Interface.
func (n *datacenterSorter) Less(i, j int) bool {
	return n.Vec[i] < n.Vec[j]
}
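
// exampleStableSortByRTT is a minimal sketch showing how datacenterSorter
// keeps the two parallel slices coherent; the names and RTT values are made
// up for illustration, and the function is not used by the package.
func exampleStableSortByRTT() []string {
	names := []string{"dc1", "dc2", "dc3"}
	rtts := []float64{0.030, 0.010, 0.020}
	sort.Stable(&datacenterSorter{names, rtts})
	// names is now ["dc2", "dc3", "dc1"], matching rtts [0.010, 0.020, 0.030].
	return names
}
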
// GetDatacentersByDistance returns a list of datacenters known to the router,
// sorted by median RTT from this server to the servers in each datacenter. If
// there are multiple areas that reach a given datacenter, this will use the
// lowest RTT for the sort.
func (r *Router) GetDatacentersByDistance() ([]string, error) {
	r.RLock()
	defer r.RUnlock()

	// Go through each area and aggregate the median RTT from the current
	// server to the other servers in each datacenter.
	dcs := make(map[string]float64)
	for areaID, info := range r.areas {
		index := make(map[string][]float64)
		coord, err := info.cluster.GetCoordinate()
		if err != nil {
			return nil, err
		}

		for _, m := range info.cluster.Members() {
			ok, parts := metadata.IsConsulServer(m)
			if !ok {
				r.logger.Printf("[WARN] consul: Non-server %q in server-only area %q",
					m.Name, areaID)
				continue
			}

			existing := index[parts.Datacenter]
			if parts.Datacenter == r.localDatacenter {
				// Everything in the local datacenter looks like zero RTT.
				index[parts.Datacenter] = append(existing, 0.0)
			} else {
				// It's OK to get a nil coordinate back; ComputeDistance
				// will put the RTT at positive infinity.
				other, _ := info.cluster.GetCachedCoordinate(parts.Name)
				rtt := lib.ComputeDistance(coord, other)
				index[parts.Datacenter] = append(existing, rtt)
			}
		}

		// Compute the median RTT between this server and the servers
		// in each datacenter. We accumulate the lowest RTT to each DC
		// in the master map, since a given DC might appear in multiple
		// areas.
		for dc, rtts := range index {
			sort.Float64s(rtts)
			rtt := rtts[len(rtts)/2] // upper median for even-length slices

			current, ok := dcs[dc]
			if !ok || rtt < current {
				dcs[dc] = rtt
			}
		}
	}

	// First sort by DC name, since we do a stable sort later.
	names := make([]string, 0, len(dcs))
	for dc := range dcs {
		names = append(names, dc)
	}
	sort.Strings(names)

	// Then stable sort by median RTT.
	rtts := make([]float64, 0, len(dcs))
	for _, dc := range names {
		rtts = append(rtts, dcs[dc])
	}
	sort.Stable(&datacenterSorter{names, rtts})
	return names, nil
}

// GetDatacenterMaps returns a structure with the raw network coordinates of
// each known server, organized by datacenter and network area.
func (r *Router) GetDatacenterMaps() ([]structs.DatacenterMap, error) {
	r.RLock()
	defer r.RUnlock()

	var maps []structs.DatacenterMap
	for areaID, info := range r.areas {
		index := make(map[string]structs.Coordinates)
		for _, m := range info.cluster.Members() {
			ok, parts := metadata.IsConsulServer(m)
			if !ok {
				r.logger.Printf("[WARN] consul: Non-server %q in server-only area %q",
					m.Name, areaID)
				continue
			}

			coord, ok := info.cluster.GetCachedCoordinate(parts.Name)
			if ok {
				entry := &structs.Coordinate{
					Node:  parts.Name,
					Coord: coord,
				}
				existing := index[parts.Datacenter]
				index[parts.Datacenter] = append(existing, entry)
			}
		}

		for dc, coords := range index {
			entry := structs.DatacenterMap{
				Datacenter:  dc,
				AreaID:      areaID,
				Coordinates: coords,
			}
			maps = append(maps, entry)
		}
	}
	return maps, nil
}
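
// exampleForward is a minimal sketch of the FindRoute contract described
// above: try the returned server, and on error feed the failure back to the
// server's manager so traffic shifts elsewhere. The forward callback is a
// hypothetical stand-in for the caller's RPC layer; the function is not used
// by the package.
func exampleForward(r *Router, datacenter string, forward func(*metadata.Server) error) error {
	manager, server, ok := r.FindRoute(datacenter)
	if !ok {
		return fmt.Errorf("no route to datacenter %q", datacenter)
	}
	if err := forward(server); err != nil {
		// Reporting the failure lets the manager rotate to another server.
		manager.NotifyFailedServer(server)
		return err
	}
	return nil
}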