github.com/keltia/go-ipfs@v0.3.8-0.20150909044612-210793031c63/p2p/net/swarm/swarm_dial.go

github.com/keltia/go-ipfs@v0.3.8-0.20150909044612-210793031c63/p2p/net/swarm/swarm_dial.go (about)

     1  package swarm
     2  
     3  import (
     4  	"errors"
     5  	"fmt"
     6  	"math/rand"
     7  	"net"
     8  	"sync"
     9  	"time"
    10  
    11  	mconn "github.com/ipfs/go-ipfs/metrics/conn"
    12  	conn "github.com/ipfs/go-ipfs/p2p/net/conn"
    13  	addrutil "github.com/ipfs/go-ipfs/p2p/net/swarm/addr"
    14  	peer "github.com/ipfs/go-ipfs/p2p/peer"
    15  	lgbl "github.com/ipfs/go-ipfs/util/eventlog/loggables"
    16  
    17  	ma "github.com/ipfs/go-ipfs/Godeps/_workspace/src/github.com/jbenet/go-multiaddr"
    18  	manet "github.com/ipfs/go-ipfs/Godeps/_workspace/src/github.com/jbenet/go-multiaddr-net"
    19  	process "github.com/ipfs/go-ipfs/Godeps/_workspace/src/github.com/jbenet/goprocess"
    20  	processctx "github.com/ipfs/go-ipfs/Godeps/_workspace/src/github.com/jbenet/goprocess/context"
    21  	ratelimit "github.com/ipfs/go-ipfs/Godeps/_workspace/src/github.com/jbenet/goprocess/ratelimit"
    22  	context "github.com/ipfs/go-ipfs/Godeps/_workspace/src/golang.org/x/net/context"
    23  )
    24  
    25  // Diagram of dial sync:
    26  //
    27  //   many callers of Dial()   synched w.  dials many addrs       results to callers
    28  //  ----------------------\    dialsync    use earliest            /--------------
    29  //  -----------------------\              |----------\           /----------------
    30  //  ------------------------>------------<-------     >---------<-----------------
    31  //  -----------------------|              \----x                 \----------------
    32  //  ----------------------|                \-----x                \---------------
    33  //                                         any may fail          if no addr at end
    34  //                                                             retry dialAttempt x
    35  
    36  var (
    37  	ErrDialBackoff = errors.New("dial backoff")
    38  	ErrDialFailed  = errors.New("dial attempt failed")
    39  	ErrDialToSelf  = errors.New("dial to self attempted")
    40  )
    41  
    42  // dialAttempts governs how many times a goroutine will try to dial a given peer.
    43  // Note: this is down to one, as we have _too many dials_ atm. To add back in,
    44  // add loop back in Dial(.)
    45  const dialAttempts = 1
    46  
    47  // DialTimeout is the amount of time each dial attempt has. We can think about making
    48  // this larger down the road, or putting more granular timeouts (i.e. within each
    49  // subcomponent of Dial)
    50  var DialTimeout time.Duration = time.Second * 10
    51  
// dialsync is a small object that helps manage ongoing dials. When many
// simultaneous dial requests for the same peer arrive, one caller performs
// the dial while the others can wait for it to finish.
//
// The zero value is ready to use; the map is created on the first Lock.
//
// Would-be dialers use it like this:
//
//  for {
//  	c := findConnectionToPeer(peer)
//  	if c != nil {
//  		return c
//  	}
//
//  	// ok, no connections. should we dial?
//  	if ok, wait := dialsync.Lock(peer); !ok {
//  		<-wait // can optionally wait
//  		continue
//  	}
//  	defer dialsync.Unlock(peer)
//
//  	c := actuallyDial(peer)
//  	return c
//  }
//
type dialsync struct {
	// ongoing maps each peer currently being dialed to the channel that is
	// closed when that dial ends. This way we don't kick off N dials to the
	// same peer simultaneously.
	ongoing map[peer.ID]chan struct{}
	// lock guards ongoing.
	lock sync.Mutex
}
    81  
    82  // Lock governs the beginning of a dial attempt.
    83  // If there are no ongoing dials, it returns true, and the client is now
    84  // scheduled to dial. Every other goroutine that calls startDial -- with
    85  //the same dst -- will block until client is done. The client MUST call
    86  // ds.Unlock(p) when it is done, to unblock the other callers.
    87  // The client is not reponsible for achieving a successful dial, only for
    88  // reporting the end of the attempt (calling ds.Unlock(p)).
    89  //
    90  // see the example below `dialsync`
    91  func (ds *dialsync) Lock(dst peer.ID) (bool, chan struct{}) {
    92  	ds.lock.Lock()
    93  	if ds.ongoing == nil { // init if not ready
    94  		ds.ongoing = make(map[peer.ID]chan struct{})
    95  	}
    96  	wait, found := ds.ongoing[dst]
    97  	if !found {
    98  		ds.ongoing[dst] = make(chan struct{})
    99  	}
   100  	ds.lock.Unlock()
   101  
   102  	if found {
   103  		return false, wait
   104  	}
   105  
   106  	// ok! you're signed up to dial!
   107  	return true, nil
   108  }
   109  
   110  // Unlock releases waiters to a dial attempt. see Lock.
   111  // if Unlock(p) is called without calling Lock(p) first, Unlock panics.
   112  func (ds *dialsync) Unlock(dst peer.ID) {
   113  	ds.lock.Lock()
   114  	wait, found := ds.ongoing[dst]
   115  	if !found {
   116  		panic("called dialDone with no ongoing dials to peer: " + dst.Pretty())
   117  	}
   118  	delete(ds.ongoing, dst) // remove ongoing dial
   119  	close(wait)             // release everyone else
   120  	ds.lock.Unlock()
   121  }
   122  
// dialbackoff is a struct used to avoid over-dialing the same, dead peers.
// Whenever a dial attempt to a peer fails, the peer is added to dialbackoff.
// Then, whenever goroutines would _wait_ on an ongoing dial (dialsync), they
// check dialbackoff first. If the peer is there, they don't wait and exit
// promptly with an error. (The single goroutine that is actually dialing
// continues to dial.) If a dial is successful, the peer is removed from
// backoff.
// Example:
//
//  for {
//  	if ok, wait := dialsync.Lock(p); !ok {
//  		if backoff.Backoff(p) {
//  			return ErrDialBackoff
//  		}
//  		<-wait
//  		continue
//  	}
//  	defer dialsync.Unlock(p)
//  	c, err := actuallyDial(p)
//  	if err != nil {
//  		dialbackoff.AddBackoff(p)
//  		continue
//  	}
//  	dialbackoff.Clear(p)
//  }
//
type dialbackoff struct {
	// entries is the set of peers currently in backoff.
	entries map[peer.ID]struct{}
	// lock guards entries.
	lock sync.RWMutex
}
   152  
   153  func (db *dialbackoff) init() {
   154  	if db.entries == nil {
   155  		db.entries = make(map[peer.ID]struct{})
   156  	}
   157  }
   158  
   159  // Backoff returns whether the client should backoff from dialing
   160  // peeer p
   161  func (db *dialbackoff) Backoff(p peer.ID) bool {
   162  	db.lock.Lock()
   163  	db.init()
   164  	_, found := db.entries[p]
   165  	db.lock.Unlock()
   166  	return found
   167  }
   168  
   169  // AddBackoff lets other nodes know that we've entered backoff with
   170  // peer p, so dialers should not wait unnecessarily. We still will
   171  // attempt to dial with one goroutine, in case we get through.
   172  func (db *dialbackoff) AddBackoff(p peer.ID) {
   173  	db.lock.Lock()
   174  	db.init()
   175  	db.entries[p] = struct{}{}
   176  	db.lock.Unlock()
   177  }
   178  
   179  // Clear removes a backoff record. Clients should call this after a
   180  // successful Dial.
   181  func (db *dialbackoff) Clear(p peer.ID) {
   182  	db.lock.Lock()
   183  	db.init()
   184  	delete(db.entries, p)
   185  	db.lock.Unlock()
   186  }
   187  
   188  // Dial connects to a peer.
   189  //
   190  // The idea is that the client of Swarm does not need to know what network
   191  // the connection will happen over. Swarm can use whichever it choses.
   192  // This allows us to use various transport protocols, do NAT traversal/relay,
   193  // etc. to achive connection.
   194  func (s *Swarm) Dial(ctx context.Context, p peer.ID) (*Conn, error) {
   195  	var logdial = lgbl.Dial("swarm", s.LocalPeer(), p, nil, nil)
   196  	if p == s.local {
   197  		log.Event(ctx, "swarmDialSelf", logdial)
   198  		return nil, ErrDialToSelf
   199  	}
   200  
   201  	return s.gatedDialAttempt(ctx, p)
   202  }
   203  
   204  func (s *Swarm) bestConnectionToPeer(p peer.ID) *Conn {
   205  	cs := s.ConnectionsToPeer(p)
   206  	for _, conn := range cs {
   207  		if conn != nil { // dump out the first one we find. (TODO pick better)
   208  			return conn
   209  		}
   210  	}
   211  	return nil
   212  }
   213  
   214  // gatedDialAttempt is an attempt to dial a node. It is gated by the swarm's
   215  // dial synchronization systems: dialsync and dialbackoff.
   216  func (s *Swarm) gatedDialAttempt(ctx context.Context, p peer.ID) (*Conn, error) {
   217  	var logdial = lgbl.Dial("swarm", s.LocalPeer(), p, nil, nil)
   218  	defer log.EventBegin(ctx, "swarmDialAttemptSync", logdial).Done()
   219  
   220  	// check if we already have an open connection first
   221  	conn := s.bestConnectionToPeer(p)
   222  	if conn != nil {
   223  		return conn, nil
   224  	}
   225  
   226  	// check if there's an ongoing dial to this peer
   227  	if ok, wait := s.dsync.Lock(p); ok {
   228  		// ok, we have been charged to dial! let's do it.
   229  		// if it succeeds, dial will add the conn to the swarm itself.
   230  
   231  		defer log.EventBegin(ctx, "swarmDialAttemptStart", logdial).Done()
   232  		ctxT, cancel := context.WithTimeout(ctx, s.dialT)
   233  		conn, err := s.dial(ctxT, p)
   234  		cancel()
   235  		s.dsync.Unlock(p)
   236  		log.Debugf("dial end %s", conn)
   237  		if err != nil {
   238  			log.Event(ctx, "swarmDialBackoffAdd", logdial)
   239  			s.backf.AddBackoff(p) // let others know to backoff
   240  
   241  			// ok, we failed. try again. (if loop is done, our error is output)
   242  			return nil, fmt.Errorf("dial attempt failed: %s", err)
   243  		}
   244  		log.Event(ctx, "swarmDialBackoffClear", logdial)
   245  		s.backf.Clear(p) // okay, no longer need to backoff
   246  		return conn, nil
   247  
   248  	} else {
   249  		// we did not dial. we must wait for someone else to dial.
   250  
   251  		// check whether we should backoff first...
   252  		if s.backf.Backoff(p) {
   253  			log.Event(ctx, "swarmDialBackoff", logdial)
   254  			return nil, ErrDialBackoff
   255  		}
   256  
   257  		defer log.EventBegin(ctx, "swarmDialWait", logdial).Done()
   258  		select {
   259  		case <-wait: // wait for that other dial to finish.
   260  
   261  			// see if it worked, OR we got an incoming dial in the meantime...
   262  			conn := s.bestConnectionToPeer(p)
   263  			if conn != nil {
   264  				return conn, nil
   265  			}
   266  			return nil, ErrDialFailed
   267  		case <-ctx.Done(): // or we may have to bail...
   268  			return nil, ctx.Err()
   269  		}
   270  	}
   271  }
   272  
   273  // dial is the actual swarm's dial logic, gated by Dial.
   274  func (s *Swarm) dial(ctx context.Context, p peer.ID) (*Conn, error) {
   275  	var logdial = lgbl.Dial("swarm", s.LocalPeer(), p, nil, nil)
   276  	if p == s.local {
   277  		log.Event(ctx, "swarmDialDoDialSelf", logdial)
   278  		return nil, ErrDialToSelf
   279  	}
   280  	defer log.EventBegin(ctx, "swarmDialDo", logdial).Done()
   281  	logdial["dial"] = "failure" // start off with failure. set to "success" at the end.
   282  
   283  	sk := s.peers.PrivKey(s.local)
   284  	logdial["encrypted"] = (sk != nil) // log wether this will be an encrypted dial or not.
   285  	if sk == nil {
   286  		// fine for sk to be nil, just log.
   287  		log.Debug("Dial not given PrivateKey, so WILL NOT SECURE conn.")
   288  	}
   289  
   290  	// get our own addrs. try dialing out from our listener addresses (reusing ports)
   291  	// Note that using our peerstore's addresses here is incorrect, as that would
   292  	// include observed addresses. TODO: make peerstore's address book smarter.
   293  	localAddrs := s.ListenAddresses()
   294  	if len(localAddrs) == 0 {
   295  		log.Debug("Dialing out with no local addresses.")
   296  	}
   297  
   298  	// get remote peer addrs
   299  	remoteAddrs := s.peers.Addrs(p)
   300  	// make sure we can use the addresses.
   301  	remoteAddrs = addrutil.FilterUsableAddrs(remoteAddrs)
   302  	// drop out any addrs that would just dial ourselves. use ListenAddresses
   303  	// as that is a more authoritative view than localAddrs.
   304  	ila, _ := s.InterfaceListenAddresses()
   305  	remoteAddrs = addrutil.Subtract(remoteAddrs, ila)
   306  	remoteAddrs = addrutil.Subtract(remoteAddrs, s.peers.Addrs(s.local))
   307  
   308  	log.Debugf("%s swarm dialing %s -- local:%s remote:%s", s.local, p, s.ListenAddresses(), remoteAddrs)
   309  	if len(remoteAddrs) == 0 {
   310  		err := errors.New("peer has no addresses")
   311  		logdial["error"] = err
   312  		return nil, err
   313  	}
   314  
   315  	remoteAddrs = s.filterAddrs(remoteAddrs)
   316  	if len(remoteAddrs) == 0 {
   317  		err := errors.New("all adresses for peer have been filtered out")
   318  		logdial["error"] = err
   319  		return nil, err
   320  	}
   321  
   322  	// open connection to peer
   323  	d := &conn.Dialer{
   324  		Dialer: manet.Dialer{
   325  			Dialer: net.Dialer{
   326  				Timeout: s.dialT,
   327  			},
   328  		},
   329  		LocalPeer:  s.local,
   330  		LocalAddrs: localAddrs,
   331  		PrivateKey: sk,
   332  		Wrapper: func(c manet.Conn) manet.Conn {
   333  			return mconn.WrapConn(s.bwc, c)
   334  		},
   335  	}
   336  
   337  	// try to get a connection to any addr
   338  	connC, err := s.dialAddrs(ctx, d, p, remoteAddrs)
   339  	if err != nil {
   340  		logdial["error"] = err
   341  		return nil, err
   342  	}
   343  	logdial["netconn"] = lgbl.NetConn(connC)
   344  
   345  	// ok try to setup the new connection.
   346  	defer log.EventBegin(ctx, "swarmDialDoSetup", logdial, lgbl.NetConn(connC)).Done()
   347  	swarmC, err := dialConnSetup(ctx, s, connC)
   348  	if err != nil {
   349  		logdial["error"] = err
   350  		connC.Close() // close the connection. didn't work out :(
   351  		return nil, err
   352  	}
   353  
   354  	logdial["dial"] = "success"
   355  	return swarmC, nil
   356  }
   357  
   358  func (s *Swarm) dialAddrs(ctx context.Context, d *conn.Dialer, p peer.ID, remoteAddrs []ma.Multiaddr) (conn.Conn, error) {
   359  
   360  	// try to connect to one of the peer's known addresses.
   361  	// we dial concurrently to each of the addresses, which:
   362  	// * makes the process faster overall
   363  	// * attempts to get the fastest connection available.
   364  	// * mitigates the waste of trying bad addresses
   365  	log.Debugf("%s swarm dialing %s %s", s.local, p, remoteAddrs)
   366  
   367  	ctx, cancel := context.WithCancel(ctx)
   368  	defer cancel() // cancel work when we exit func
   369  
   370  	foundConn := make(chan struct{})
   371  	conns := make(chan conn.Conn, len(remoteAddrs))
   372  	errs := make(chan error, len(remoteAddrs))
   373  
   374  	// dialSingleAddr is used in the rate-limited async thing below.
   375  	dialSingleAddr := func(addr ma.Multiaddr) {
   376  		connC, err := s.dialAddr(ctx, d, p, addr)
   377  
   378  		// check parent still wants our results
   379  		select {
   380  		case <-foundConn:
   381  			if connC != nil {
   382  				connC.Close()
   383  			}
   384  			return
   385  		default:
   386  		}
   387  
   388  		if err != nil {
   389  			errs <- err
   390  		} else if connC == nil {
   391  			errs <- fmt.Errorf("failed to dial %s %s", p, addr)
   392  		} else {
   393  			conns <- connC
   394  		}
   395  	}
   396  
   397  	// this whole thing is in a goroutine so we can use foundConn
   398  	// to end early.
   399  	go func() {
   400  		// rate limiting just in case. at most 10 addrs at once.
   401  		limiter := ratelimit.NewRateLimiter(process.Background(), 10)
   402  		limiter.Go(func(worker process.Process) {
   403  			// permute addrs so we try different sets first each time.
   404  			for _, i := range rand.Perm(len(remoteAddrs)) {
   405  				select {
   406  				case <-foundConn: // if one of them succeeded already
   407  					break
   408  				case <-worker.Closing(): // our context was cancelled
   409  					break
   410  				default:
   411  				}
   412  
   413  				workerAddr := remoteAddrs[i] // shadow variable to avoid race
   414  				limiter.LimitedGo(func(worker process.Process) {
   415  					dialSingleAddr(workerAddr)
   416  				})
   417  			}
   418  		})
   419  
   420  		processctx.CloseAfterContext(limiter, ctx)
   421  	}()
   422  
   423  	// wair fot the results.
   424  	exitErr := fmt.Errorf("failed to dial %s", p)
   425  	for i := 0; i < len(remoteAddrs); i++ {
   426  		select {
   427  		case exitErr = <-errs: //
   428  			log.Debug("dial error: ", exitErr)
   429  		case connC := <-conns:
   430  			// take the first + return asap
   431  			close(foundConn)
   432  			return connC, nil
   433  		}
   434  	}
   435  	return nil, exitErr
   436  }
   437  
   438  func (s *Swarm) dialAddr(ctx context.Context, d *conn.Dialer, p peer.ID, addr ma.Multiaddr) (conn.Conn, error) {
   439  	log.Debugf("%s swarm dialing %s %s", s.local, p, addr)
   440  
   441  	connC, err := d.Dial(ctx, addr, p)
   442  	if err != nil {
   443  		return nil, fmt.Errorf("%s --> %s dial attempt failed: %s", s.local, p, err)
   444  	}
   445  
   446  	// if the connection is not to whom we thought it would be...
   447  	remotep := connC.RemotePeer()
   448  	if remotep != p {
   449  		connC.Close()
   450  		return nil, fmt.Errorf("misdial to %s through %s (got %s)", p, addr, remotep)
   451  	}
   452  
   453  	// if the connection is to ourselves...
   454  	// this can happen TONS when Loopback addrs are advertized.
   455  	// (this should be caught by two checks above, but let's just make sure.)
   456  	if remotep == s.local {
   457  		connC.Close()
   458  		return nil, fmt.Errorf("misdial to %s through %s (got self)", p, addr)
   459  	}
   460  
   461  	// success! we got one!
   462  	return connC, nil
   463  }
   464  
   465  func (s *Swarm) filterAddrs(addrs []ma.Multiaddr) []ma.Multiaddr {
   466  	var out []ma.Multiaddr
   467  	for _, a := range addrs {
   468  		if !s.Filters.AddrBlocked(a) {
   469  			out = append(out, a)
   470  		}
   471  	}
   472  	return out
   473  }
   474  
   475  // dialConnSetup is the setup logic for a connection from the dial side. it
   476  // needs to add the Conn to the StreamSwarm, then run newConnSetup
   477  func dialConnSetup(ctx context.Context, s *Swarm, connC conn.Conn) (*Conn, error) {
   478  
   479  	psC, err := s.swarm.AddConn(connC)
   480  	if err != nil {
   481  		// connC is closed by caller if we fail.
   482  		return nil, fmt.Errorf("failed to add conn to ps.Swarm: %s", err)
   483  	}
   484  
   485  	// ok try to setup the new connection. (newConnSetup will add to group)
   486  	swarmC, err := s.newConnSetup(ctx, psC)
   487  	if err != nil {
   488  		psC.Close() // we need to make sure psC is Closed.
   489  		return nil, err
   490  	}
   491  
   492  	return swarmC, err
   493  }