github.com/tonistiigi/docker@v0.10.1-0.20240229224939-974013b0dc6a/libnetwork/agent.go (about)

     1  package libnetwork
     2  
     3  //go:generate protoc -I=. -I=../vendor/ --gogofaster_out=import_path=github.com/docker/docker/libnetwork:. agent.proto
     4  
     5  import (
     6  	"context"
     7  	"encoding/json"
     8  	"fmt"
     9  	"net"
    10  	"sort"
    11  	"sync"
    12  
    13  	"github.com/containerd/log"
    14  	"github.com/docker/docker/libnetwork/cluster"
    15  	"github.com/docker/docker/libnetwork/discoverapi"
    16  	"github.com/docker/docker/libnetwork/driverapi"
    17  	"github.com/docker/docker/libnetwork/networkdb"
    18  	"github.com/docker/docker/libnetwork/scope"
    19  	"github.com/docker/docker/libnetwork/types"
    20  	"github.com/docker/go-events"
    21  	"github.com/gogo/protobuf/proto"
    22  )
    23  
// Identifiers and limits for the encryption keys handled by the agent.
//
//   - subsysGossip is the Subsystem value of the keys that this file hands to
//     NetworkDB (SetKey / SetPrimaryKey / RemoveKey and the initial config).
//   - subsysIPSec is the Subsystem value of the keys that this file passes to
//     drivers through the EncryptionKeysConfig / EncryptionKeysUpdate
//     discovery notifications.
//   - keyringSize is not referenced in the visible part of this file;
//     presumably the number of keys kept per subsystem — TODO confirm.
const (
	subsysGossip = "networking:gossip"
	subsysIPSec  = "networking:ipsec"
	keyringSize  = 3
)
    29  
    30  // ByTime implements sort.Interface for []*types.EncryptionKey based on
    31  // the LamportTime field.
    32  type ByTime []*types.EncryptionKey
    33  
    34  func (b ByTime) Len() int           { return len(b) }
    35  func (b ByTime) Swap(i, j int)      { b[i], b[j] = b[j], b[i] }
    36  func (b ByTime) Less(i, j int) bool { return b[i].LamportTime < b[j].LamportTime }
    37  
// nwAgent bundles the state the controller keeps for its cluster agent.
//
//   - networkDB is the NetworkDB instance created in agentInit.
//   - bindAddr is the IP resolved from the bind address or interface name
//     given to agentInit.
//   - advertiseAddr and dataPathAddr are stored as passed to agentInit;
//     dataPathAddress() prefers dataPathAddr and falls back to advertiseAddr.
//   - coreCancelFuncs holds the cancel functions of the watches registered in
//     agentInit (endpoint table and node table).
//   - driverCancelFuncs holds the cancel functions of per-network driver table
//     watches, keyed by network ID (see addDriverWatches).
//   - mu is held by the code in this file when it reads or writes
//     driverCancelFuncs and coreCancelFuncs, and by dataPathAddress().
type nwAgent struct {
	networkDB         *networkdb.NetworkDB
	bindAddr          net.IP
	advertiseAddr     string
	dataPathAddr      string
	coreCancelFuncs   []func()
	driverCancelFuncs map[string][]func()
	mu                sync.Mutex
}
    47  
    48  func (a *nwAgent) dataPathAddress() string {
    49  	a.mu.Lock()
    50  	defer a.mu.Unlock()
    51  	if a.dataPathAddr != "" {
    52  		return a.dataPathAddr
    53  	}
    54  	return a.advertiseAddr
    55  }
    56  
// libnetworkEPTable is the name of the NetworkDB table in which this file
// stores and watches the marshalled EndpointRecord of every endpoint.
const libnetworkEPTable = "endpoint_table"
    58  
    59  func getBindAddr(ifaceName string) (net.IP, error) {
    60  	iface, err := net.InterfaceByName(ifaceName)
    61  	if err != nil {
    62  		return nil, fmt.Errorf("failed to find interface %s: %v", ifaceName, err)
    63  	}
    64  
    65  	addrs, err := iface.Addrs()
    66  	if err != nil {
    67  		return nil, fmt.Errorf("failed to get interface addresses: %v", err)
    68  	}
    69  
    70  	for _, a := range addrs {
    71  		addr, ok := a.(*net.IPNet)
    72  		if !ok {
    73  			continue
    74  		}
    75  		addrIP := addr.IP
    76  
    77  		if addrIP.IsLinkLocalUnicast() {
    78  			continue
    79  		}
    80  
    81  		return addrIP, nil
    82  	}
    83  
    84  	return nil, fmt.Errorf("failed to get bind address")
    85  }
    86  
    87  // resolveAddr resolves the given address, which can be one of, and
    88  // parsed in the following order or priority:
    89  //
    90  // - a well-formed IP-address
    91  // - a hostname
    92  // - an interface-name
    93  func resolveAddr(addrOrInterface string) (net.IP, error) {
    94  	// Try and see if this is a valid IP address
    95  	if ip := net.ParseIP(addrOrInterface); ip != nil {
    96  		return ip, nil
    97  	}
    98  
    99  	// If not a valid IP address, it could be a hostname.
   100  	addr, err := net.ResolveIPAddr("ip", addrOrInterface)
   101  	if err != nil {
   102  		// If hostname lookup failed, try to look for an interface with the given name.
   103  		return getBindAddr(addrOrInterface)
   104  	}
   105  	return addr.IP, nil
   106  }
   107  
   108  func (c *Controller) handleKeyChange(keys []*types.EncryptionKey) error {
   109  	drvEnc := discoverapi.DriverEncryptionUpdate{}
   110  
   111  	agent := c.getAgent()
   112  	if agent == nil {
   113  		log.G(context.TODO()).Debug("Skipping key change as agent is nil")
   114  		return nil
   115  	}
   116  
   117  	// Find the deleted key. If the deleted key was the primary key,
   118  	// a new primary key should be set before removing if from keyring.
   119  	c.mu.Lock()
   120  	added := []byte{}
   121  	deleted := []byte{}
   122  	j := len(c.keys)
   123  	for i := 0; i < j; {
   124  		same := false
   125  		for _, key := range keys {
   126  			if same = key.LamportTime == c.keys[i].LamportTime; same {
   127  				break
   128  			}
   129  		}
   130  		if !same {
   131  			cKey := c.keys[i]
   132  			if cKey.Subsystem == subsysGossip {
   133  				deleted = cKey.Key
   134  			}
   135  
   136  			if cKey.Subsystem == subsysIPSec {
   137  				drvEnc.Prune = cKey.Key
   138  				drvEnc.PruneTag = cKey.LamportTime
   139  			}
   140  			c.keys[i], c.keys[j-1] = c.keys[j-1], c.keys[i]
   141  			c.keys[j-1] = nil
   142  			j--
   143  		}
   144  		i++
   145  	}
   146  	c.keys = c.keys[:j]
   147  
   148  	// Find the new key and add it to the key ring
   149  	for _, key := range keys {
   150  		same := false
   151  		for _, cKey := range c.keys {
   152  			if same = cKey.LamportTime == key.LamportTime; same {
   153  				break
   154  			}
   155  		}
   156  		if !same {
   157  			c.keys = append(c.keys, key)
   158  			if key.Subsystem == subsysGossip {
   159  				added = key.Key
   160  			}
   161  
   162  			if key.Subsystem == subsysIPSec {
   163  				drvEnc.Key = key.Key
   164  				drvEnc.Tag = key.LamportTime
   165  			}
   166  		}
   167  	}
   168  	c.mu.Unlock()
   169  
   170  	if len(added) > 0 {
   171  		agent.networkDB.SetKey(added)
   172  	}
   173  
   174  	key, _, err := c.getPrimaryKeyTag(subsysGossip)
   175  	if err != nil {
   176  		return err
   177  	}
   178  	agent.networkDB.SetPrimaryKey(key)
   179  
   180  	key, tag, err := c.getPrimaryKeyTag(subsysIPSec)
   181  	if err != nil {
   182  		return err
   183  	}
   184  	drvEnc.Primary = key
   185  	drvEnc.PrimaryTag = tag
   186  
   187  	if len(deleted) > 0 {
   188  		agent.networkDB.RemoveKey(deleted)
   189  	}
   190  
   191  	c.drvRegistry.WalkDrivers(func(name string, driver driverapi.Driver, capability driverapi.Capability) bool {
   192  		dr, ok := driver.(discoverapi.Discover)
   193  		if !ok {
   194  			return false
   195  		}
   196  		if err := dr.DiscoverNew(discoverapi.EncryptionKeysUpdate, drvEnc); err != nil {
   197  			log.G(context.TODO()).Warnf("Failed to update datapath keys in driver %s: %v", name, err)
   198  			// Attempt to reconfigure keys in case of a update failure
   199  			// which can arise due to a mismatch of keys
   200  			// if worker nodes get temporarily disconnected
   201  			log.G(context.TODO()).Warnf("Reconfiguring datapath keys for  %s", name)
   202  			drvCfgEnc := discoverapi.DriverEncryptionConfig{}
   203  			drvCfgEnc.Keys, drvCfgEnc.Tags = c.getKeys(subsysIPSec)
   204  			err = dr.DiscoverNew(discoverapi.EncryptionKeysConfig, drvCfgEnc)
   205  			if err != nil {
   206  				log.G(context.TODO()).Warnf("Failed to reset datapath keys in driver %s: %v", name, err)
   207  			}
   208  		}
   209  		return false
   210  	})
   211  
   212  	return nil
   213  }
   214  
   215  func (c *Controller) agentSetup(clusterProvider cluster.Provider) error {
   216  	agent := c.getAgent()
   217  	if agent != nil {
   218  		// agent is already present, so there is no need initialize it again.
   219  		return nil
   220  	}
   221  
   222  	bindAddr := clusterProvider.GetLocalAddress()
   223  	advAddr := clusterProvider.GetAdvertiseAddress()
   224  	dataAddr := clusterProvider.GetDataPathAddress()
   225  	remoteList := clusterProvider.GetRemoteAddressList()
   226  	remoteAddrList := make([]string, 0, len(remoteList))
   227  	for _, remote := range remoteList {
   228  		addr, _, _ := net.SplitHostPort(remote)
   229  		remoteAddrList = append(remoteAddrList, addr)
   230  	}
   231  
   232  	listen := clusterProvider.GetListenAddress()
   233  	listenAddr, _, _ := net.SplitHostPort(listen)
   234  
   235  	log.G(context.TODO()).WithFields(log.Fields{
   236  		"listen-addr":               listenAddr,
   237  		"local-addr":                bindAddr,
   238  		"advertise-addr":            advAddr,
   239  		"data-path-addr":            dataAddr,
   240  		"remote-addr-list":          remoteAddrList,
   241  		"network-control-plane-mtu": c.Config().NetworkControlPlaneMTU,
   242  	}).Info("Initializing Libnetwork Agent")
   243  	if advAddr != "" {
   244  		if err := c.agentInit(listenAddr, bindAddr, advAddr, dataAddr); err != nil {
   245  			log.G(context.TODO()).WithError(err).Errorf("Error in agentInit")
   246  			return err
   247  		}
   248  		c.drvRegistry.WalkDrivers(func(name string, driver driverapi.Driver, capability driverapi.Capability) bool {
   249  			if capability.ConnectivityScope == scope.Global {
   250  				if d, ok := driver.(discoverapi.Discover); ok {
   251  					c.agentDriverNotify(d)
   252  				}
   253  			}
   254  			return false
   255  		})
   256  	}
   257  
   258  	if len(remoteAddrList) > 0 {
   259  		if err := c.agentJoin(remoteAddrList); err != nil {
   260  			log.G(context.TODO()).WithError(err).Error("Error in joining gossip cluster: join will be retried in background")
   261  		}
   262  	}
   263  
   264  	return nil
   265  }
   266  
   267  // For a given subsystem getKeys sorts the keys by lamport time and returns
   268  // slice of keys and lamport time which can used as a unique tag for the keys
   269  func (c *Controller) getKeys(subsystem string) (keys [][]byte, tags []uint64) {
   270  	c.mu.Lock()
   271  	defer c.mu.Unlock()
   272  
   273  	sort.Sort(ByTime(c.keys))
   274  
   275  	keys = make([][]byte, 0, len(c.keys))
   276  	tags = make([]uint64, 0, len(c.keys))
   277  	for _, key := range c.keys {
   278  		if key.Subsystem == subsystem {
   279  			keys = append(keys, key.Key)
   280  			tags = append(tags, key.LamportTime)
   281  		}
   282  	}
   283  
   284  	if len(keys) > 1 {
   285  		// TODO(thaJeztah): why are we swapping order here? This code was added in https://github.com/moby/libnetwork/commit/e83d68b7d1fd9c479120914024242238f791b4dc
   286  		keys[0], keys[1] = keys[1], keys[0]
   287  		tags[0], tags[1] = tags[1], tags[0]
   288  	}
   289  	return keys, tags
   290  }
   291  
   292  // getPrimaryKeyTag returns the primary key for a given subsystem from the
   293  // list of sorted key and the associated tag
   294  func (c *Controller) getPrimaryKeyTag(subsystem string) (key []byte, lamportTime uint64, _ error) {
   295  	c.mu.Lock()
   296  	defer c.mu.Unlock()
   297  	sort.Sort(ByTime(c.keys))
   298  	keys := make([]*types.EncryptionKey, 0, len(c.keys))
   299  	for _, k := range c.keys {
   300  		if k.Subsystem == subsystem {
   301  			keys = append(keys, k)
   302  		}
   303  	}
   304  	if len(keys) < 2 {
   305  		return nil, 0, fmt.Errorf("no primary key found for %s subsystem: %d keys found on controller, expected at least 2", subsystem, len(keys))
   306  	}
   307  	return keys[1].Key, keys[1].LamportTime, nil
   308  }
   309  
   310  func (c *Controller) agentInit(listenAddr, bindAddrOrInterface, advertiseAddr, dataPathAddr string) error {
   311  	bindAddr, err := resolveAddr(bindAddrOrInterface)
   312  	if err != nil {
   313  		return err
   314  	}
   315  
   316  	keys, _ := c.getKeys(subsysGossip)
   317  
   318  	netDBConf := networkdb.DefaultConfig()
   319  	netDBConf.BindAddr = listenAddr
   320  	netDBConf.AdvertiseAddr = advertiseAddr
   321  	netDBConf.Keys = keys
   322  	if c.Config().NetworkControlPlaneMTU != 0 {
   323  		// Consider the MTU remove the IP hdr (IPv4 or IPv6) and the TCP/UDP hdr.
   324  		// To be on the safe side let's cut 100 bytes
   325  		netDBConf.PacketBufferSize = (c.Config().NetworkControlPlaneMTU - 100)
   326  		log.G(context.TODO()).Debugf("Control plane MTU: %d will initialize NetworkDB with: %d",
   327  			c.Config().NetworkControlPlaneMTU, netDBConf.PacketBufferSize)
   328  	}
   329  	nDB, err := networkdb.New(netDBConf)
   330  	if err != nil {
   331  		return err
   332  	}
   333  
   334  	// Register the diagnostic handlers
   335  	nDB.RegisterDiagnosticHandlers(c.DiagnosticServer)
   336  
   337  	var cancelList []func()
   338  	ch, cancel := nDB.Watch(libnetworkEPTable, "")
   339  	cancelList = append(cancelList, cancel)
   340  	nodeCh, cancel := nDB.Watch(networkdb.NodeTable, "")
   341  	cancelList = append(cancelList, cancel)
   342  
   343  	c.mu.Lock()
   344  	c.agent = &nwAgent{
   345  		networkDB:         nDB,
   346  		bindAddr:          bindAddr,
   347  		advertiseAddr:     advertiseAddr,
   348  		dataPathAddr:      dataPathAddr,
   349  		coreCancelFuncs:   cancelList,
   350  		driverCancelFuncs: make(map[string][]func()),
   351  	}
   352  	c.mu.Unlock()
   353  
   354  	go c.handleTableEvents(ch, c.handleEpTableEvent)
   355  	go c.handleTableEvents(nodeCh, c.handleNodeTableEvent)
   356  
   357  	keys, tags := c.getKeys(subsysIPSec)
   358  	c.drvRegistry.WalkDrivers(func(name string, driver driverapi.Driver, capability driverapi.Capability) bool {
   359  		if dr, ok := driver.(discoverapi.Discover); ok {
   360  			if err := dr.DiscoverNew(discoverapi.EncryptionKeysConfig, discoverapi.DriverEncryptionConfig{
   361  				Keys: keys,
   362  				Tags: tags,
   363  			}); err != nil {
   364  				log.G(context.TODO()).Warnf("Failed to set datapath keys in driver %s: %v", name, err)
   365  			}
   366  		}
   367  		return false
   368  	})
   369  
   370  	c.WalkNetworks(joinCluster)
   371  
   372  	return nil
   373  }
   374  
   375  func (c *Controller) agentJoin(remoteAddrList []string) error {
   376  	agent := c.getAgent()
   377  	if agent == nil {
   378  		return nil
   379  	}
   380  	return agent.networkDB.Join(remoteAddrList)
   381  }
   382  
   383  func (c *Controller) agentDriverNotify(d discoverapi.Discover) {
   384  	agent := c.getAgent()
   385  	if agent == nil {
   386  		return
   387  	}
   388  
   389  	if err := d.DiscoverNew(discoverapi.NodeDiscovery, discoverapi.NodeDiscoveryData{
   390  		Address:     agent.dataPathAddress(),
   391  		BindAddress: agent.bindAddr.String(),
   392  		Self:        true,
   393  	}); err != nil {
   394  		log.G(context.TODO()).Warnf("Failed the node discovery in driver: %v", err)
   395  	}
   396  
   397  	keys, tags := c.getKeys(subsysIPSec)
   398  	if err := d.DiscoverNew(discoverapi.EncryptionKeysConfig, discoverapi.DriverEncryptionConfig{
   399  		Keys: keys,
   400  		Tags: tags,
   401  	}); err != nil {
   402  		log.G(context.TODO()).Warnf("Failed to set datapath keys in driver: %v", err)
   403  	}
   404  }
   405  
   406  func (c *Controller) agentClose() {
   407  	// Acquire current agent instance and reset its pointer
   408  	// then run closing functions
   409  	c.mu.Lock()
   410  	agent := c.agent
   411  	c.agent = nil
   412  	c.mu.Unlock()
   413  
   414  	// when the agent is closed the cluster provider should be cleaned up
   415  	c.SetClusterProvider(nil)
   416  
   417  	if agent == nil {
   418  		return
   419  	}
   420  
   421  	var cancelList []func()
   422  
   423  	agent.mu.Lock()
   424  	for _, cancelFuncs := range agent.driverCancelFuncs {
   425  		cancelList = append(cancelList, cancelFuncs...)
   426  	}
   427  
   428  	// Add also the cancel functions for the network db
   429  	cancelList = append(cancelList, agent.coreCancelFuncs...)
   430  	agent.mu.Unlock()
   431  
   432  	for _, cancel := range cancelList {
   433  		cancel()
   434  	}
   435  
   436  	agent.networkDB.Close()
   437  }
   438  
// Task has the backend container details.
//
// As filled in by Network.Services: Name is the Name of the endpoint record,
// EndpointID is the key of the record in the endpoint table, EndpointIP is
// the EndpointIP of the record, and Info is the driver-specific information
// returned by the driver's DecodeTableEntry for that endpoint (nil when the
// driver tables contain no entry for it).
type Task struct {
	Name       string
	EndpointID string
	EndpointIP string
	Info       map[string]string
}
   446  
// ServiceInfo has service specific details along with the list of backend tasks.
//
// As filled in by Network.Services: VIP is the VirtualIP of an endpoint
// record of the service, LocalLBIndex is the value returned by the
// controller's getLBIndex for that record, Tasks has one entry per endpoint
// record carrying the service name, and Ports lists the ingress ports of an
// endpoint record formatted as "Target: <n>, Publish: <n>".
type ServiceInfo struct {
	VIP          string
	LocalLBIndex int
	Tasks        []Task
	Ports        []string
}
   454  
// epRecord is the per-endpoint scratch value used by Network.Services while
// it aggregates data: ep is the decoded record from the endpoint table, info
// is the driver-specific data decoded from the driver tables, and lbIndex is
// the result of the controller's getLBIndex for the endpoint's service.
type epRecord struct {
	ep      EndpointRecord
	info    map[string]string
	lbIndex int
}
   460  
   461  // Services returns a map of services keyed by the service name with the details
   462  // of all the tasks that belong to the service. Applicable only in swarm mode.
   463  func (n *Network) Services() map[string]ServiceInfo {
   464  	agent, ok := n.clusterAgent()
   465  	if !ok {
   466  		return nil
   467  	}
   468  	nwID := n.ID()
   469  	d, err := n.driver(true)
   470  	if err != nil {
   471  		log.G(context.TODO()).Errorf("Could not resolve driver for network %s/%s while fetching services: %v", n.networkType, nwID, err)
   472  		return nil
   473  	}
   474  
   475  	// Walk through libnetworkEPTable and fetch the driver agnostic endpoint info
   476  	eps := make(map[string]epRecord)
   477  	c := n.getController()
   478  	for eid, value := range agent.networkDB.GetTableByNetwork(libnetworkEPTable, nwID) {
   479  		var epRec EndpointRecord
   480  		if err := proto.Unmarshal(value.Value, &epRec); err != nil {
   481  			log.G(context.TODO()).Errorf("Unmarshal of libnetworkEPTable failed for endpoint %s in network %s, %v", eid, nwID, err)
   482  			continue
   483  		}
   484  		eps[eid] = epRecord{
   485  			ep:      epRec,
   486  			lbIndex: c.getLBIndex(epRec.ServiceID, nwID, epRec.IngressPorts),
   487  		}
   488  	}
   489  
   490  	// Walk through the driver's tables, have the driver decode the entries
   491  	// and return the tuple {ep ID, value}. value is a string that coveys
   492  	// relevant info about the endpoint.
   493  	for _, table := range n.driverTables {
   494  		if table.objType != driverapi.EndpointObject {
   495  			continue
   496  		}
   497  		for key, value := range agent.networkDB.GetTableByNetwork(table.name, nwID) {
   498  			epID, info := d.DecodeTableEntry(table.name, key, value.Value)
   499  			if ep, ok := eps[epID]; !ok {
   500  				log.G(context.TODO()).Errorf("Inconsistent driver and libnetwork state for endpoint %s", epID)
   501  			} else {
   502  				ep.info = info
   503  				eps[epID] = ep
   504  			}
   505  		}
   506  	}
   507  
   508  	// group the endpoints into a map keyed by the service name
   509  	sinfo := make(map[string]ServiceInfo)
   510  	for ep, epr := range eps {
   511  		s, ok := sinfo[epr.ep.ServiceName]
   512  		if !ok {
   513  			s = ServiceInfo{
   514  				VIP:          epr.ep.VirtualIP,
   515  				LocalLBIndex: epr.lbIndex,
   516  			}
   517  		}
   518  		if s.Ports == nil {
   519  			ports := make([]string, 0, len(epr.ep.IngressPorts))
   520  			for _, port := range epr.ep.IngressPorts {
   521  				ports = append(ports, fmt.Sprintf("Target: %d, Publish: %d", port.TargetPort, port.PublishedPort))
   522  			}
   523  			s.Ports = ports
   524  		}
   525  		s.Tasks = append(s.Tasks, Task{
   526  			Name:       epr.ep.Name,
   527  			EndpointID: ep,
   528  			EndpointIP: epr.ep.EndpointIP,
   529  			Info:       epr.info,
   530  		})
   531  		sinfo[epr.ep.ServiceName] = s
   532  	}
   533  	return sinfo
   534  }
   535  
   536  // clusterAgent returns the cluster agent if the network is a swarm-scoped,
   537  // multi-host network.
   538  func (n *Network) clusterAgent() (agent *nwAgent, ok bool) {
   539  	if n.scope != scope.Swarm || !n.driverIsMultihost() {
   540  		return nil, false
   541  	}
   542  	a := n.getController().getAgent()
   543  	return a, a != nil
   544  }
   545  
   546  func (n *Network) joinCluster() error {
   547  	agent, ok := n.clusterAgent()
   548  	if !ok {
   549  		return nil
   550  	}
   551  	return agent.networkDB.JoinNetwork(n.ID())
   552  }
   553  
   554  func (n *Network) leaveCluster() error {
   555  	agent, ok := n.clusterAgent()
   556  	if !ok {
   557  		return nil
   558  	}
   559  	return agent.networkDB.LeaveNetwork(n.ID())
   560  }
   561  
   562  func (ep *Endpoint) addDriverInfoToCluster() error {
   563  	if ep.joinInfo == nil || len(ep.joinInfo.driverTableEntries) == 0 {
   564  		return nil
   565  	}
   566  	n := ep.getNetwork()
   567  	agent, ok := n.clusterAgent()
   568  	if !ok {
   569  		return nil
   570  	}
   571  
   572  	nwID := n.ID()
   573  	for _, te := range ep.joinInfo.driverTableEntries {
   574  		if err := agent.networkDB.CreateEntry(te.tableName, nwID, te.key, te.value); err != nil {
   575  			return err
   576  		}
   577  	}
   578  	return nil
   579  }
   580  
   581  func (ep *Endpoint) deleteDriverInfoFromCluster() error {
   582  	if ep.joinInfo == nil || len(ep.joinInfo.driverTableEntries) == 0 {
   583  		return nil
   584  	}
   585  	n := ep.getNetwork()
   586  	agent, ok := n.clusterAgent()
   587  	if !ok {
   588  		return nil
   589  	}
   590  
   591  	nwID := n.ID()
   592  	for _, te := range ep.joinInfo.driverTableEntries {
   593  		if err := agent.networkDB.DeleteEntry(te.tableName, nwID, te.key); err != nil {
   594  			return err
   595  		}
   596  	}
   597  	return nil
   598  }
   599  
   600  func (ep *Endpoint) addServiceInfoToCluster(sb *Sandbox) error {
   601  	if len(ep.dnsNames) == 0 || ep.Iface() == nil || ep.Iface().Address() == nil {
   602  		return nil
   603  	}
   604  
   605  	n := ep.getNetwork()
   606  	agent, ok := n.clusterAgent()
   607  	if !ok {
   608  		return nil
   609  	}
   610  
   611  	sb.service.Lock()
   612  	defer sb.service.Unlock()
   613  	log.G(context.TODO()).Debugf("addServiceInfoToCluster START for %s %s", ep.svcName, ep.ID())
   614  
   615  	// Check that the endpoint is still present on the sandbox before adding it to the service discovery.
   616  	// This is to handle a race between the EnableService and the sbLeave
   617  	// It is possible that the EnableService starts, fetches the list of the endpoints and
   618  	// by the time the addServiceInfoToCluster is called the endpoint got removed from the sandbox
   619  	// The risk is that the deleteServiceInfoToCluster happens before the addServiceInfoToCluster.
   620  	// This check under the Service lock of the sandbox ensure the correct behavior.
   621  	// If the addServiceInfoToCluster arrives first may find or not the endpoint and will proceed or exit
   622  	// but in any case the deleteServiceInfoToCluster will follow doing the cleanup if needed.
   623  	// In case the deleteServiceInfoToCluster arrives first, this one is happening after the endpoint is
   624  	// removed from the list, in this situation the delete will bail out not finding any data to cleanup
   625  	// and the add will bail out not finding the endpoint on the sandbox.
   626  	if err := sb.GetEndpoint(ep.ID()); err == nil {
   627  		log.G(context.TODO()).Warnf("addServiceInfoToCluster suppressing service resolution ep is not anymore in the sandbox %s", ep.ID())
   628  		return nil
   629  	}
   630  
   631  	dnsNames := ep.getDNSNames()
   632  	primaryDNSName, dnsAliases := dnsNames[0], dnsNames[1:]
   633  
   634  	var ingressPorts []*PortConfig
   635  	if ep.svcID != "" {
   636  		// This is a task part of a service
   637  		// Gossip ingress ports only in ingress network.
   638  		if n.ingress {
   639  			ingressPorts = ep.ingressPorts
   640  		}
   641  		if err := n.getController().addServiceBinding(ep.svcName, ep.svcID, n.ID(), ep.ID(), primaryDNSName, ep.virtualIP, ingressPorts, ep.svcAliases, dnsAliases, ep.Iface().Address().IP, "addServiceInfoToCluster"); err != nil {
   642  			return err
   643  		}
   644  	} else {
   645  		// This is a container simply attached to an attachable network
   646  		if err := n.getController().addContainerNameResolution(n.ID(), ep.ID(), primaryDNSName, dnsAliases, ep.Iface().Address().IP, "addServiceInfoToCluster"); err != nil {
   647  			return err
   648  		}
   649  	}
   650  
   651  	buf, err := proto.Marshal(&EndpointRecord{
   652  		Name:            primaryDNSName,
   653  		ServiceName:     ep.svcName,
   654  		ServiceID:       ep.svcID,
   655  		VirtualIP:       ep.virtualIP.String(),
   656  		IngressPorts:    ingressPorts,
   657  		Aliases:         ep.svcAliases,
   658  		TaskAliases:     dnsAliases,
   659  		EndpointIP:      ep.Iface().Address().IP.String(),
   660  		ServiceDisabled: false,
   661  	})
   662  	if err != nil {
   663  		return err
   664  	}
   665  
   666  	if err := agent.networkDB.CreateEntry(libnetworkEPTable, n.ID(), ep.ID(), buf); err != nil {
   667  		log.G(context.TODO()).Warnf("addServiceInfoToCluster NetworkDB CreateEntry failed for %s %s err:%s", ep.id, n.id, err)
   668  		return err
   669  	}
   670  
   671  	log.G(context.TODO()).Debugf("addServiceInfoToCluster END for %s %s", ep.svcName, ep.ID())
   672  
   673  	return nil
   674  }
   675  
   676  func (ep *Endpoint) deleteServiceInfoFromCluster(sb *Sandbox, fullRemove bool, method string) error {
   677  	if len(ep.dnsNames) == 0 {
   678  		return nil
   679  	}
   680  
   681  	n := ep.getNetwork()
   682  	agent, ok := n.clusterAgent()
   683  	if !ok {
   684  		return nil
   685  	}
   686  
   687  	sb.service.Lock()
   688  	defer sb.service.Unlock()
   689  	log.G(context.TODO()).Debugf("deleteServiceInfoFromCluster from %s START for %s %s", method, ep.svcName, ep.ID())
   690  
   691  	// Avoid a race w/ with a container that aborts preemptively.  This would
   692  	// get caught in disableServceInNetworkDB, but we check here to make the
   693  	// nature of the condition more clear.
   694  	// See comment in addServiceInfoToCluster()
   695  	if err := sb.GetEndpoint(ep.ID()); err == nil {
   696  		log.G(context.TODO()).Warnf("deleteServiceInfoFromCluster suppressing service resolution ep is not anymore in the sandbox %s", ep.ID())
   697  		return nil
   698  	}
   699  
   700  	dnsNames := ep.getDNSNames()
   701  	primaryDNSName, dnsAliases := dnsNames[0], dnsNames[1:]
   702  
   703  	// First update the networkDB then locally
   704  	if fullRemove {
   705  		if err := agent.networkDB.DeleteEntry(libnetworkEPTable, n.ID(), ep.ID()); err != nil {
   706  			log.G(context.TODO()).Warnf("deleteServiceInfoFromCluster NetworkDB DeleteEntry failed for %s %s err:%s", ep.id, n.id, err)
   707  		}
   708  	} else {
   709  		disableServiceInNetworkDB(agent, n, ep)
   710  	}
   711  
   712  	if ep.Iface() != nil && ep.Iface().Address() != nil {
   713  		if ep.svcID != "" {
   714  			// This is a task part of a service
   715  			var ingressPorts []*PortConfig
   716  			if n.ingress {
   717  				ingressPorts = ep.ingressPorts
   718  			}
   719  			if err := n.getController().rmServiceBinding(ep.svcName, ep.svcID, n.ID(), ep.ID(), primaryDNSName, ep.virtualIP, ingressPorts, ep.svcAliases, dnsAliases, ep.Iface().Address().IP, "deleteServiceInfoFromCluster", true, fullRemove); err != nil {
   720  				return err
   721  			}
   722  		} else {
   723  			// This is a container simply attached to an attachable network
   724  			if err := n.getController().delContainerNameResolution(n.ID(), ep.ID(), primaryDNSName, dnsAliases, ep.Iface().Address().IP, "deleteServiceInfoFromCluster"); err != nil {
   725  				return err
   726  			}
   727  		}
   728  	}
   729  
   730  	log.G(context.TODO()).Debugf("deleteServiceInfoFromCluster from %s END for %s %s", method, ep.svcName, ep.ID())
   731  
   732  	return nil
   733  }
   734  
   735  func disableServiceInNetworkDB(a *nwAgent, n *Network, ep *Endpoint) {
   736  	var epRec EndpointRecord
   737  
   738  	log.G(context.TODO()).Debugf("disableServiceInNetworkDB for %s %s", ep.svcName, ep.ID())
   739  
   740  	// Update existing record to indicate that the service is disabled
   741  	inBuf, err := a.networkDB.GetEntry(libnetworkEPTable, n.ID(), ep.ID())
   742  	if err != nil {
   743  		log.G(context.TODO()).Warnf("disableServiceInNetworkDB GetEntry failed for %s %s err:%s", ep.id, n.id, err)
   744  		return
   745  	}
   746  	// Should never fail
   747  	if err := proto.Unmarshal(inBuf, &epRec); err != nil {
   748  		log.G(context.TODO()).Errorf("disableServiceInNetworkDB unmarshal failed for %s %s err:%s", ep.id, n.id, err)
   749  		return
   750  	}
   751  	epRec.ServiceDisabled = true
   752  	// Should never fail
   753  	outBuf, err := proto.Marshal(&epRec)
   754  	if err != nil {
   755  		log.G(context.TODO()).Errorf("disableServiceInNetworkDB marshalling failed for %s %s err:%s", ep.id, n.id, err)
   756  		return
   757  	}
   758  	// Send update to the whole cluster
   759  	if err := a.networkDB.UpdateEntry(libnetworkEPTable, n.ID(), ep.ID(), outBuf); err != nil {
   760  		log.G(context.TODO()).Warnf("disableServiceInNetworkDB UpdateEntry failed for %s %s err:%s", ep.id, n.id, err)
   761  	}
   762  }
   763  
   764  func (n *Network) addDriverWatches() {
   765  	if len(n.driverTables) == 0 {
   766  		return
   767  	}
   768  	agent, ok := n.clusterAgent()
   769  	if !ok {
   770  		return
   771  	}
   772  
   773  	c := n.getController()
   774  	for _, table := range n.driverTables {
   775  		ch, cancel := agent.networkDB.Watch(table.name, n.ID())
   776  		agent.mu.Lock()
   777  		agent.driverCancelFuncs[n.ID()] = append(agent.driverCancelFuncs[n.ID()], cancel)
   778  		agent.mu.Unlock()
   779  		go c.handleTableEvents(ch, n.handleDriverTableEvent)
   780  		d, err := n.driver(false)
   781  		if err != nil {
   782  			log.G(context.TODO()).Errorf("Could not resolve driver %s while walking driver tabl: %v", n.networkType, err)
   783  			return
   784  		}
   785  
   786  		err = agent.networkDB.WalkTable(table.name, func(nid, key string, value []byte, deleted bool) bool {
   787  			// skip the entries that are mark for deletion, this is safe because this function is
   788  			// called at initialization time so there is no state to delete
   789  			if nid == n.ID() && !deleted {
   790  				d.EventNotify(driverapi.Create, nid, table.name, key, value)
   791  			}
   792  			return false
   793  		})
   794  		if err != nil {
   795  			log.G(context.TODO()).WithError(err).Warn("Error while walking networkdb")
   796  		}
   797  	}
   798  }
   799  
   800  func (n *Network) cancelDriverWatches() {
   801  	agent, ok := n.clusterAgent()
   802  	if !ok {
   803  		return
   804  	}
   805  
   806  	agent.mu.Lock()
   807  	cancelFuncs := agent.driverCancelFuncs[n.ID()]
   808  	delete(agent.driverCancelFuncs, n.ID())
   809  	agent.mu.Unlock()
   810  
   811  	for _, cancel := range cancelFuncs {
   812  		cancel()
   813  	}
   814  }
   815  
   816  func (c *Controller) handleTableEvents(ch *events.Channel, fn func(events.Event)) {
   817  	for {
   818  		select {
   819  		case ev := <-ch.C:
   820  			fn(ev)
   821  		case <-ch.Done():
   822  			return
   823  		}
   824  	}
   825  }
   826  
   827  func (n *Network) handleDriverTableEvent(ev events.Event) {
   828  	d, err := n.driver(false)
   829  	if err != nil {
   830  		log.G(context.TODO()).Errorf("Could not resolve driver %s while handling driver table event: %v", n.networkType, err)
   831  		return
   832  	}
   833  
   834  	var (
   835  		etype driverapi.EventType
   836  		tname string
   837  		key   string
   838  		value []byte
   839  	)
   840  
   841  	switch event := ev.(type) {
   842  	case networkdb.CreateEvent:
   843  		tname = event.Table
   844  		key = event.Key
   845  		value = event.Value
   846  		etype = driverapi.Create
   847  	case networkdb.DeleteEvent:
   848  		tname = event.Table
   849  		key = event.Key
   850  		value = event.Value
   851  		etype = driverapi.Delete
   852  	case networkdb.UpdateEvent:
   853  		tname = event.Table
   854  		key = event.Key
   855  		value = event.Value
   856  		etype = driverapi.Delete
   857  	}
   858  
   859  	d.EventNotify(etype, n.ID(), tname, key, value)
   860  }
   861  
   862  func (c *Controller) handleNodeTableEvent(ev events.Event) {
   863  	var (
   864  		value    []byte
   865  		isAdd    bool
   866  		nodeAddr networkdb.NodeAddr
   867  	)
   868  	switch event := ev.(type) {
   869  	case networkdb.CreateEvent:
   870  		value = event.Value
   871  		isAdd = true
   872  	case networkdb.DeleteEvent:
   873  		value = event.Value
   874  	case networkdb.UpdateEvent:
   875  		log.G(context.TODO()).Errorf("Unexpected update node table event = %#v", event)
   876  	}
   877  
   878  	err := json.Unmarshal(value, &nodeAddr)
   879  	if err != nil {
   880  		log.G(context.TODO()).Errorf("Error unmarshalling node table event %v", err)
   881  		return
   882  	}
   883  	c.processNodeDiscovery([]net.IP{nodeAddr.Addr}, isAdd)
   884  }
   885  
   886  func (c *Controller) handleEpTableEvent(ev events.Event) {
   887  	var (
   888  		nid   string
   889  		eid   string
   890  		value []byte
   891  		epRec EndpointRecord
   892  	)
   893  
   894  	switch event := ev.(type) {
   895  	case networkdb.CreateEvent:
   896  		nid = event.NetworkID
   897  		eid = event.Key
   898  		value = event.Value
   899  	case networkdb.DeleteEvent:
   900  		nid = event.NetworkID
   901  		eid = event.Key
   902  		value = event.Value
   903  	case networkdb.UpdateEvent:
   904  		nid = event.NetworkID
   905  		eid = event.Key
   906  		value = event.Value
   907  	default:
   908  		log.G(context.TODO()).Errorf("Unexpected update service table event = %#v", event)
   909  		return
   910  	}
   911  
   912  	err := proto.Unmarshal(value, &epRec)
   913  	if err != nil {
   914  		log.G(context.TODO()).Errorf("Failed to unmarshal service table value: %v", err)
   915  		return
   916  	}
   917  
   918  	containerName := epRec.Name
   919  	svcName := epRec.ServiceName
   920  	svcID := epRec.ServiceID
   921  	vip := net.ParseIP(epRec.VirtualIP)
   922  	ip := net.ParseIP(epRec.EndpointIP)
   923  	ingressPorts := epRec.IngressPorts
   924  	serviceAliases := epRec.Aliases
   925  	taskAliases := epRec.TaskAliases
   926  
   927  	if containerName == "" || ip == nil {
   928  		log.G(context.TODO()).Errorf("Invalid endpoint name/ip received while handling service table event %s", value)
   929  		return
   930  	}
   931  
   932  	switch ev.(type) {
   933  	case networkdb.CreateEvent:
   934  		log.G(context.TODO()).Debugf("handleEpTableEvent ADD %s R:%v", eid, epRec)
   935  		if svcID != "" {
   936  			// This is a remote task part of a service
   937  			if err := c.addServiceBinding(svcName, svcID, nid, eid, containerName, vip, ingressPorts, serviceAliases, taskAliases, ip, "handleEpTableEvent"); err != nil {
   938  				log.G(context.TODO()).Errorf("failed adding service binding for %s epRec:%v err:%v", eid, epRec, err)
   939  				return
   940  			}
   941  		} else {
   942  			// This is a remote container simply attached to an attachable network
   943  			if err := c.addContainerNameResolution(nid, eid, containerName, taskAliases, ip, "handleEpTableEvent"); err != nil {
   944  				log.G(context.TODO()).Errorf("failed adding container name resolution for %s epRec:%v err:%v", eid, epRec, err)
   945  			}
   946  		}
   947  
   948  	case networkdb.DeleteEvent:
   949  		log.G(context.TODO()).Debugf("handleEpTableEvent DEL %s R:%v", eid, epRec)
   950  		if svcID != "" {
   951  			// This is a remote task part of a service
   952  			if err := c.rmServiceBinding(svcName, svcID, nid, eid, containerName, vip, ingressPorts, serviceAliases, taskAliases, ip, "handleEpTableEvent", true, true); err != nil {
   953  				log.G(context.TODO()).Errorf("failed removing service binding for %s epRec:%v err:%v", eid, epRec, err)
   954  				return
   955  			}
   956  		} else {
   957  			// This is a remote container simply attached to an attachable network
   958  			if err := c.delContainerNameResolution(nid, eid, containerName, taskAliases, ip, "handleEpTableEvent"); err != nil {
   959  				log.G(context.TODO()).Errorf("failed removing container name resolution for %s epRec:%v err:%v", eid, epRec, err)
   960  			}
   961  		}
   962  	case networkdb.UpdateEvent:
   963  		log.G(context.TODO()).Debugf("handleEpTableEvent UPD %s R:%v", eid, epRec)
   964  		// We currently should only get these to inform us that an endpoint
   965  		// is disabled.  Report if otherwise.
   966  		if svcID == "" || !epRec.ServiceDisabled {
   967  			log.G(context.TODO()).Errorf("Unexpected update table event for %s epRec:%v", eid, epRec)
   968  			return
   969  		}
   970  		// This is a remote task that is part of a service that is now disabled
   971  		if err := c.rmServiceBinding(svcName, svcID, nid, eid, containerName, vip, ingressPorts, serviceAliases, taskAliases, ip, "handleEpTableEvent", true, false); err != nil {
   972  			log.G(context.TODO()).Errorf("failed disabling service binding for %s epRec:%v err:%v", eid, epRec, err)
   973  			return
   974  		}
   975  	}
   976  }