github.com/keltia/go-ipfs@v0.3.8-0.20150909044612-210793031c63/p2p/net/swarm/swarm_dial.go

package swarm

import (
	"errors"
	"fmt"
	"math/rand"
	"net"
	"sync"
	"time"

	mconn "github.com/ipfs/go-ipfs/metrics/conn"
	conn "github.com/ipfs/go-ipfs/p2p/net/conn"
	addrutil "github.com/ipfs/go-ipfs/p2p/net/swarm/addr"
	peer "github.com/ipfs/go-ipfs/p2p/peer"
	lgbl "github.com/ipfs/go-ipfs/util/eventlog/loggables"

	ma "github.com/ipfs/go-ipfs/Godeps/_workspace/src/github.com/jbenet/go-multiaddr"
	manet "github.com/ipfs/go-ipfs/Godeps/_workspace/src/github.com/jbenet/go-multiaddr-net"
	process "github.com/ipfs/go-ipfs/Godeps/_workspace/src/github.com/jbenet/goprocess"
	processctx "github.com/ipfs/go-ipfs/Godeps/_workspace/src/github.com/jbenet/goprocess/context"
	ratelimit "github.com/ipfs/go-ipfs/Godeps/_workspace/src/github.com/jbenet/goprocess/ratelimit"
	context "github.com/ipfs/go-ipfs/Godeps/_workspace/src/golang.org/x/net/context"
)

// Diagram of dial sync:
//
//   many callers of Dial()   synched w. dials    many addrs       results to callers
//  ----------------------\    dialsync     use earliest       /--------------
//  -----------------------\             |----------\         /----------------
//  ------------------------>------------<-------    >-------<-----------------
//  -----------------------|             \----x               \----------------
//  ----------------------|              \-----x               \---------------
//                                       any may fail       if no addr at end
//                                                          retry dialAttempt x

var (
	ErrDialBackoff = errors.New("dial backoff")
	ErrDialFailed  = errors.New("dial attempt failed")
	ErrDialToSelf  = errors.New("dial to self attempted")
)

// dialAttempts governs how many times a goroutine will try to dial a given peer.
// Note: this is down to one, as we have _too many dials_ atm. To add it back in,
// add the loop back in Dial(.)
const dialAttempts = 1

// DialTimeout is the amount of time each dial attempt has. We can think about making
// this larger down the road, or adding more granular timeouts (i.e. within each
// subcomponent of Dial)
var DialTimeout time.Duration = time.Second * 10

// dialsync is a small object that helps manage ongoing dials.
// this way, if we receive many simultaneous dial requests, one
// can do its thing, while the rest wait.
//
// this interface is so would-be dialers can just:
//
//	for {
//		c := findConnectionToPeer(peer)
//		if c != nil {
//			return c
//		}
//
//		// ok, no connections. should we dial?
//		if ok, wait := dialsync.Lock(peer); !ok {
//			<-wait // can optionally wait
//			continue
//		}
//		defer dialsync.Unlock(peer)
//
//		c := actuallyDial(peer)
//		return c
//	}
//
type dialsync struct {
	// ongoing is a map of tickets for the current peers being dialed.
	// this way, we don't kick off N dials simultaneously.
	ongoing map[peer.ID]chan struct{}
	lock    sync.Mutex
}

// Lock governs the beginning of a dial attempt.
// If there are no ongoing dials, it returns true, and the client is now
// scheduled to dial. Every other goroutine that calls Lock -- with the
// same dst -- will block until the client is done. The client MUST call
// ds.Unlock(p) when it is done, to unblock the other callers.
// The client is not responsible for achieving a successful dial, only for
// reporting the end of the attempt (calling ds.Unlock(p)).
//
// see the example below `dialsync`
func (ds *dialsync) Lock(dst peer.ID) (bool, chan struct{}) {
	ds.lock.Lock()
	if ds.ongoing == nil { // init if not ready
		ds.ongoing = make(map[peer.ID]chan struct{})
	}
	wait, found := ds.ongoing[dst]
	if !found {
		ds.ongoing[dst] = make(chan struct{})
	}
	ds.lock.Unlock()

	if found {
		return false, wait
	}

	// ok! you're signed up to dial!
	return true, nil
}

// Unlock releases waiters to a dial attempt. see Lock.
// if Unlock(p) is called without calling Lock(p) first, Unlock panics.
func (ds *dialsync) Unlock(dst peer.ID) {
	ds.lock.Lock()
	wait, found := ds.ongoing[dst]
	if !found {
		panic("called Unlock with no ongoing dials to peer: " + dst.Pretty())
	}

	delete(ds.ongoing, dst) // remove ongoing dial
	close(wait)             // release everyone else
	ds.lock.Unlock()
}
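// For illustration, here is the Lock/Unlock contract above as a compilable
// sketch. This is an editor's example, not part of the original file:
// findConn and dial are hypothetical stand-ins for the swarm's real
// connection-lookup and dial logic.
func exampleDialSyncUse(ds *dialsync, p peer.ID,
	findConn func(peer.ID) *Conn,
	dial func(peer.ID) (*Conn, error)) (*Conn, error) {

	for {
		// fast path: reuse an existing connection if one appeared.
		if c := findConn(p); c != nil {
			return c, nil
		}

		// slow path: dial ourselves, or wait on whoever is already dialing.
		ok, wait := ds.Lock(p)
		if !ok {
			<-wait // blocks until the current dialer calls Unlock(p)
			continue
		}

		c, err := dial(p)
		ds.Unlock(p) // always unblock waiters, success or failure
		return c, err
	}
}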
// dialbackoff is a struct used to avoid over-dialing the same, dead peers.
// Whenever we totally time out on a peer (all dial attempts), we add them
// to dialbackoff. Then, whenever goroutines would _wait_ (dialsync), they
// check dialbackoff. If the peer is there, they don't wait and exit promptly
// with an error. (the single goroutine that is actually dialing continues to
// dial). If a dial is successful, the peer is removed from backoff.
// Example:
//
//	for {
//		if ok, wait := dialsync.Lock(p); !ok {
//			if backoff.Backoff(p) {
//				return errDialFailed
//			}
//			<-wait
//			continue
//		}
//		defer dialsync.Unlock(p)
//		c, err := actuallyDial(p)
//		if err != nil {
//			dialbackoff.AddBackoff(p)
//			continue
//		}
//		dialbackoff.Clear(p)
//	}
//
type dialbackoff struct {
	entries map[peer.ID]struct{}
	lock    sync.RWMutex
}

func (db *dialbackoff) init() {
	if db.entries == nil {
		db.entries = make(map[peer.ID]struct{})
	}
}

// Backoff returns whether the client should back off from dialing
// peer p
func (db *dialbackoff) Backoff(p peer.ID) bool {
	db.lock.Lock()
	db.init()
	_, found := db.entries[p]
	db.lock.Unlock()
	return found
}

// AddBackoff lets other nodes know that we've entered backoff with
// peer p, so dialers should not wait unnecessarily. We still will
// attempt to dial with a single goroutine, in case we get through.
func (db *dialbackoff) AddBackoff(p peer.ID) {
	db.lock.Lock()
	db.init()
	db.entries[p] = struct{}{}
	db.lock.Unlock()
}

// Clear removes a backoff record. Clients should call this after a
// successful Dial.
func (db *dialbackoff) Clear(p peer.ID) {
	db.lock.Lock()
	db.init()
	delete(db.entries, p)
	db.lock.Unlock()
}
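// The example above, as a compilable sketch (an editor's illustration, not
// part of the original file; actuallyDial is a hypothetical stand-in). Note
// the asymmetry it encodes: only waiters consult Backoff -- the goroutine
// holding the dialsync lock always dials, so one goroutine keeps probing a
// backed-off peer in case it comes back.
func exampleBackoffUse(ds *dialsync, db *dialbackoff, p peer.ID,
	actuallyDial func(peer.ID) (*Conn, error)) (*Conn, error) {

	ok, wait := ds.Lock(p)
	if !ok {
		if db.Backoff(p) {
			return nil, ErrDialBackoff // known-bad peer: fail fast, don't wait
		}
		<-wait
		return nil, ErrDialFailed
	}
	defer ds.Unlock(p)

	c, err := actuallyDial(p)
	if err != nil {
		db.AddBackoff(p) // record the failure so future waiters fail fast
		return nil, err
	}
	db.Clear(p) // a successful dial removes the backoff record
	return c, nil
}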
// Dial connects to a peer.
//
// The idea is that the client of Swarm does not need to know what network
// the connection will happen over. Swarm can use whichever it chooses.
// This allows us to use various transport protocols, do NAT traversal/relay,
// etc. to achieve connection.
func (s *Swarm) Dial(ctx context.Context, p peer.ID) (*Conn, error) {
	var logdial = lgbl.Dial("swarm", s.LocalPeer(), p, nil, nil)
	if p == s.local {
		log.Event(ctx, "swarmDialSelf", logdial)
		return nil, ErrDialToSelf
	}

	return s.gatedDialAttempt(ctx, p)
}

func (s *Swarm) bestConnectionToPeer(p peer.ID) *Conn {
	cs := s.ConnectionsToPeer(p)
	for _, conn := range cs {
		if conn != nil { // dump out the first one we find. (TODO pick better)
			return conn
		}
	}
	return nil
}

// gatedDialAttempt is an attempt to dial a node. It is gated by the swarm's
// dial synchronization systems: dialsync and dialbackoff.
func (s *Swarm) gatedDialAttempt(ctx context.Context, p peer.ID) (*Conn, error) {
	var logdial = lgbl.Dial("swarm", s.LocalPeer(), p, nil, nil)
	defer log.EventBegin(ctx, "swarmDialAttemptSync", logdial).Done()

	// check if we already have an open connection first
	conn := s.bestConnectionToPeer(p)
	if conn != nil {
		return conn, nil
	}

	// check if there's an ongoing dial to this peer
	if ok, wait := s.dsync.Lock(p); ok {
		// ok, we have been charged to dial! let's do it.
		// if it succeeds, dial will add the conn to the swarm itself.

		defer log.EventBegin(ctx, "swarmDialAttemptStart", logdial).Done()
		ctxT, cancel := context.WithTimeout(ctx, s.dialT)
		conn, err := s.dial(ctxT, p)
		cancel()
		s.dsync.Unlock(p)
		log.Debugf("dial end %s", conn)
		if err != nil {
			log.Event(ctx, "swarmDialBackoffAdd", logdial)
			s.backf.AddBackoff(p) // let others know to back off

			// ok, we failed. (if the retry loop is ever added back, it would
			// try again here; for now our error is the output.)
			return nil, fmt.Errorf("dial attempt failed: %s", err)
		}
		log.Event(ctx, "swarmDialBackoffClear", logdial)
		s.backf.Clear(p) // okay, no longer need to back off
		return conn, nil

	} else {
		// we did not dial. we must wait for someone else to dial.

		// check whether we should back off first...
		if s.backf.Backoff(p) {
			log.Event(ctx, "swarmDialBackoff", logdial)
			return nil, ErrDialBackoff
		}

		defer log.EventBegin(ctx, "swarmDialWait", logdial).Done()
		select {
		case <-wait: // wait for that other dial to finish.

			// see if it worked, OR we got an incoming dial in the meantime...
			conn := s.bestConnectionToPeer(p)
			if conn != nil {
				return conn, nil
			}
			return nil, ErrDialFailed
		case <-ctx.Done(): // or we may have to bail...
			return nil, ctx.Err()
		}
	}
}
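// An editor's sketch of caller-side usage (not part of the original file):
// because both branches of gatedDialAttempt respect ctx, a caller can bound
// the total wait -- including time spent waiting on another goroutine's dial
// -- with its own deadline, independent of DialTimeout.
func exampleDialWithDeadline(ctx context.Context, s *Swarm, p peer.ID) (*Conn, error) {
	ctx, cancel := context.WithTimeout(ctx, 30*time.Second)
	defer cancel()
	return s.Dial(ctx, p)
}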
// dial is the actual swarm's dial logic, gated by Dial.
func (s *Swarm) dial(ctx context.Context, p peer.ID) (*Conn, error) {
	var logdial = lgbl.Dial("swarm", s.LocalPeer(), p, nil, nil)
	if p == s.local {
		log.Event(ctx, "swarmDialDoDialSelf", logdial)
		return nil, ErrDialToSelf
	}
	defer log.EventBegin(ctx, "swarmDialDo", logdial).Done()
	logdial["dial"] = "failure" // start off with failure. set to "success" at the end.

	sk := s.peers.PrivKey(s.local)
	logdial["encrypted"] = (sk != nil) // log whether this will be an encrypted dial or not.
	if sk == nil {
		// fine for sk to be nil, just log.
		log.Debug("Dial not given PrivateKey, so WILL NOT SECURE conn.")
	}

	// get our own addrs. try dialing out from our listener addresses (reusing ports).
	// Note that using our peerstore's addresses here would be incorrect, as that
	// would include observed addresses. TODO: make peerstore's address book smarter.
	localAddrs := s.ListenAddresses()
	if len(localAddrs) == 0 {
		log.Debug("Dialing out with no local addresses.")
	}

	// get remote peer addrs
	remoteAddrs := s.peers.Addrs(p)
	// make sure we can use the addresses.
	remoteAddrs = addrutil.FilterUsableAddrs(remoteAddrs)
	// drop any addrs that would just dial ourselves. use ListenAddresses
	// as that is a more authoritative view than localAddrs.
	ila, _ := s.InterfaceListenAddresses()
	remoteAddrs = addrutil.Subtract(remoteAddrs, ila)
	remoteAddrs = addrutil.Subtract(remoteAddrs, s.peers.Addrs(s.local))

	log.Debugf("%s swarm dialing %s -- local:%s remote:%s", s.local, p, s.ListenAddresses(), remoteAddrs)
	if len(remoteAddrs) == 0 {
		err := errors.New("peer has no addresses")
		logdial["error"] = err
		return nil, err
	}

	remoteAddrs = s.filterAddrs(remoteAddrs)
	if len(remoteAddrs) == 0 {
		err := errors.New("all addresses for peer have been filtered out")
		logdial["error"] = err
		return nil, err
	}

	// open connection to peer
	d := &conn.Dialer{
		Dialer: manet.Dialer{
			Dialer: net.Dialer{
				Timeout: s.dialT,
			},
		},
		LocalPeer:  s.local,
		LocalAddrs: localAddrs,
		PrivateKey: sk,
		Wrapper: func(c manet.Conn) manet.Conn {
			return mconn.WrapConn(s.bwc, c)
		},
	}

	// try to get a connection to any addr
	connC, err := s.dialAddrs(ctx, d, p, remoteAddrs)
	if err != nil {
		logdial["error"] = err
		return nil, err
	}
	logdial["netconn"] = lgbl.NetConn(connC)

	// ok, try to set up the new connection.
	defer log.EventBegin(ctx, "swarmDialDoSetup", logdial, lgbl.NetConn(connC)).Done()
	swarmC, err := dialConnSetup(ctx, s, connC)
	if err != nil {
		logdial["error"] = err
		connC.Close() // close the connection. didn't work out :(
		return nil, err
	}

	logdial["dial"] = "success"
	return swarmC, nil
}
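// The remote-address pipeline inside dial, factored out for clarity. This is
// an editor's sketch built only from calls that appear above; usableRemoteAddrs
// is a hypothetical helper, not part of the original file.
func usableRemoteAddrs(s *Swarm, p peer.ID) []ma.Multiaddr {
	// drop addresses whose transports we cannot dial.
	addrs := addrutil.FilterUsableAddrs(s.peers.Addrs(p))

	// never dial addresses that would loop back to ourselves: subtract the
	// authoritative interface view first, then our own advertised addrs.
	ila, _ := s.InterfaceListenAddresses()
	addrs = addrutil.Subtract(addrs, ila)
	addrs = addrutil.Subtract(addrs, s.peers.Addrs(s.local))

	// finally, honor the swarm's configured address filters.
	return s.filterAddrs(addrs)
}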
func (s *Swarm) dialAddrs(ctx context.Context, d *conn.Dialer, p peer.ID, remoteAddrs []ma.Multiaddr) (conn.Conn, error) {

	// try to connect to one of the peer's known addresses.
	// we dial concurrently to each of the addresses, which:
	// * makes the process faster overall
	// * attempts to get the fastest connection available.
	// * mitigates the waste of trying bad addresses
	log.Debugf("%s swarm dialing %s %s", s.local, p, remoteAddrs)

	ctx, cancel := context.WithCancel(ctx)
	defer cancel() // cancel work when we exit func

	foundConn := make(chan struct{})
	conns := make(chan conn.Conn, len(remoteAddrs))
	errs := make(chan error, len(remoteAddrs))

	// dialSingleAddr is used in the rate-limited async thing below.
	dialSingleAddr := func(addr ma.Multiaddr) {
		connC, err := s.dialAddr(ctx, d, p, addr)

		// check parent still wants our results
		select {
		case <-foundConn:
			if connC != nil {
				connC.Close()
			}
			return
		default:
		}

		if err != nil {
			errs <- err
		} else if connC == nil {
			errs <- fmt.Errorf("failed to dial %s %s", p, addr)
		} else {
			conns <- connC
		}
	}

	// this whole thing is in a goroutine so we can use foundConn
	// to end early.
	go func() {
		// rate limiting just in case. at most 10 addrs at once.
		limiter := ratelimit.NewRateLimiter(process.Background(), 10)
		limiter.Go(func(worker process.Process) {
			// permute addrs so we try different sets first each time.
			for _, i := range rand.Perm(len(remoteAddrs)) {
				select {
				case <-foundConn: // if one of them succeeded already
					return // stop launching dials; a bare break would only exit the select
				case <-worker.Closing(): // our context was cancelled
					return
				default:
				}

				workerAddr := remoteAddrs[i] // shadow variable to avoid race
				limiter.LimitedGo(func(worker process.Process) {
					dialSingleAddr(workerAddr)
				})
			}
		})

		processctx.CloseAfterContext(limiter, ctx)
	}()

	// wait for the results.
	exitErr := fmt.Errorf("failed to dial %s", p)
	for i := 0; i < len(remoteAddrs); i++ {
		select {
		case exitErr = <-errs:
			log.Debug("dial error: ", exitErr)
		case connC := <-conns:
			// take the first + return asap
			close(foundConn)
			return connC, nil
		}
	}
	return nil, exitErr
}

func (s *Swarm) dialAddr(ctx context.Context, d *conn.Dialer, p peer.ID, addr ma.Multiaddr) (conn.Conn, error) {
	log.Debugf("%s swarm dialing %s %s", s.local, p, addr)

	connC, err := d.Dial(ctx, addr, p)
	if err != nil {
		return nil, fmt.Errorf("%s --> %s dial attempt failed: %s", s.local, p, err)
	}

	// if the connection is not to whom we thought it would be...
	remotep := connC.RemotePeer()
	if remotep != p {
		connC.Close()
		return nil, fmt.Errorf("misdial to %s through %s (got %s)", p, addr, remotep)
	}

	// if the connection is to ourselves...
	// this can happen TONS when Loopback addrs are advertised.
	// (this should be caught by the two checks above, but let's just make sure.)
	if remotep == s.local {
		connC.Close()
		return nil, fmt.Errorf("misdial to %s through %s (got self)", p, addr)
	}

	// success! we got one!
	return connC, nil
}

func (s *Swarm) filterAddrs(addrs []ma.Multiaddr) []ma.Multiaddr {
	var out []ma.Multiaddr
	for _, a := range addrs {
		if !s.Filters.AddrBlocked(a) {
			out = append(out, a)
		}
	}
	return out
}

// dialConnSetup is the setup logic for a connection from the dial side. it
// needs to add the Conn to the StreamSwarm, then run newConnSetup
func dialConnSetup(ctx context.Context, s *Swarm, connC conn.Conn) (*Conn, error) {

	psC, err := s.swarm.AddConn(connC)
	if err != nil {
		// connC is closed by the caller if we fail.
		return nil, fmt.Errorf("failed to add conn to ps.Swarm: %s", err)
	}

	// ok, try to set up the new connection. (newConnSetup will add to group)
	swarmC, err := s.newConnSetup(ctx, psC)
	if err != nil {
		psC.Close() // we need to make sure psC is Closed.
		return nil, err
	}

	return swarmC, err
}
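// The dialAddrs fan-out above, distilled to its core pattern as an editor's
// sketch (firstSuccess and dialOne are hypothetical, not part of the original
// file): dial every address concurrently, hand back the first success, and
// cancel and close the losers.
func firstSuccess(ctx context.Context, addrs []ma.Multiaddr,
	dialOne func(context.Context, ma.Multiaddr) (conn.Conn, error)) (conn.Conn, error) {

	if len(addrs) == 0 {
		return nil, errors.New("no addresses to dial")
	}

	ctx, cancel := context.WithCancel(ctx)
	defer cancel() // returning cancels every in-flight dial

	conns := make(chan conn.Conn)        // unbuffered: losers never hand off
	errs := make(chan error, len(addrs)) // buffered: error sends never block

	for _, a := range addrs {
		go func(a ma.Multiaddr) {
			c, err := dialOne(ctx, a)
			if err != nil {
				errs <- err
				return
			}
			select {
			case conns <- c: // we won the race
			case <-ctx.Done(): // a winner was already chosen; clean up
				c.Close()
			}
		}(a)
	}

	var lastErr error
	for range addrs {
		select {
		case c := <-conns:
			return c, nil
		case lastErr = <-errs: // keep the most recent failure
		case <-ctx.Done():
			return nil, ctx.Err()
		}
	}
	return nil, lastErr
}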