github.com/Psiphon-Labs/psiphon-tunnel-core@v2.0.28+incompatible/psiphon/controller.go (about) 1 /* 2 * Copyright (c) 2015, Psiphon Inc. 3 * All rights reserved. 4 * 5 * This program is free software: you can redistribute it and/or modify 6 * it under the terms of the GNU General Public License as published by 7 * the Free Software Foundation, either version 3 of the License, or 8 * (at your option) any later version. 9 * 10 * This program is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * GNU General Public License for more details. 14 * 15 * You should have received a copy of the GNU General Public License 16 * along with this program. If not, see <http://www.gnu.org/licenses/>. 17 * 18 */ 19 20 // Package psiphon implements the core tunnel functionality of a Psiphon client. 21 // The main function is RunForever, which runs a Controller that obtains lists of 22 // servers, establishes tunnel connections, and runs local proxies through which 23 // tunneled traffic may be sent. 24 package psiphon 25 26 import ( 27 "context" 28 "fmt" 29 "math/rand" 30 "net" 31 "runtime" 32 "sync" 33 "sync/atomic" 34 "time" 35 36 "github.com/Psiphon-Labs/psiphon-tunnel-core/psiphon/common" 37 "github.com/Psiphon-Labs/psiphon-tunnel-core/psiphon/common/errors" 38 "github.com/Psiphon-Labs/psiphon-tunnel-core/psiphon/common/parameters" 39 "github.com/Psiphon-Labs/psiphon-tunnel-core/psiphon/common/prng" 40 "github.com/Psiphon-Labs/psiphon-tunnel-core/psiphon/common/protocol" 41 "github.com/Psiphon-Labs/psiphon-tunnel-core/psiphon/common/resolver" 42 "github.com/Psiphon-Labs/psiphon-tunnel-core/psiphon/common/tun" 43 lrucache "github.com/cognusion/go-cache-lru" 44 ) 45 46 // Controller is a tunnel lifecycle coordinator. It manages lists of servers to 47 // connect to; establishes and monitors tunnels; and runs local proxies which 48 // route traffic through the tunnels. 49 type Controller struct { 50 config *Config 51 runCtx context.Context 52 stopRunning context.CancelFunc 53 runWaitGroup *sync.WaitGroup 54 connectedTunnels chan *Tunnel 55 failedTunnels chan *Tunnel 56 tunnelMutex sync.Mutex 57 establishedOnce bool 58 tunnelPoolSize int 59 tunnels []*Tunnel 60 nextTunnel int 61 isEstablishing bool 62 establishStartTime time.Time 63 protocolSelectionConstraints *protocolSelectionConstraints 64 concurrentEstablishTunnelsMutex sync.Mutex 65 establishConnectTunnelCount int 66 concurrentEstablishTunnels int 67 concurrentIntensiveEstablishTunnels int 68 peakConcurrentEstablishTunnels int 69 peakConcurrentIntensiveEstablishTunnels int 70 establishCtx context.Context 71 stopEstablish context.CancelFunc 72 establishWaitGroup *sync.WaitGroup 73 establishedTunnelsCount int32 74 candidateServerEntries chan *candidateServerEntry 75 untunneledDialConfig *DialConfig 76 untunneledSplitTunnelClassifications *lrucache.Cache 77 splitTunnelClassificationTTL time.Duration 78 splitTunnelClassificationMaxEntries int 79 signalFetchCommonRemoteServerList chan struct{} 80 signalFetchObfuscatedServerLists chan struct{} 81 signalDownloadUpgrade chan string 82 signalReportServerEntries chan *serverEntriesReportRequest 83 signalReportConnected chan struct{} 84 signalRestartEstablishing chan struct{} 85 serverAffinityDoneBroadcast chan struct{} 86 packetTunnelClient *tun.Client 87 packetTunnelTransport *PacketTunnelTransport 88 staggerMutex sync.Mutex 89 resolver *resolver.Resolver 90 } 91 92 // NewController initializes a new controller. 93 func NewController(config *Config) (controller *Controller, err error) { 94 95 if !config.IsCommitted() { 96 return nil, errors.TraceNew("uncommitted config") 97 } 98 99 // Needed by regen, at least 100 rand.Seed(int64(time.Now().Nanosecond())) 101 102 // The session ID for the Psiphon server API is used across all 103 // tunnels established by the controller. 104 NoticeSessionId(config.SessionID) 105 106 // Attempt to apply any valid, local stored tactics. The pre-done context 107 // ensures no tactics request is attempted now. 108 doneContext, cancelFunc := context.WithCancel(context.Background()) 109 cancelFunc() 110 GetTactics(doneContext, config) 111 112 p := config.GetParameters().Get() 113 splitTunnelClassificationTTL := 114 p.Duration(parameters.SplitTunnelClassificationTTL) 115 splitTunnelClassificationMaxEntries := 116 p.Int(parameters.SplitTunnelClassificationMaxEntries) 117 118 controller = &Controller{ 119 config: config, 120 runWaitGroup: new(sync.WaitGroup), 121 // connectedTunnels and failedTunnels buffer sizes are large enough to 122 // receive full pools of tunnels without blocking. Senders should not block. 123 connectedTunnels: make(chan *Tunnel, MAX_TUNNEL_POOL_SIZE), 124 failedTunnels: make(chan *Tunnel, MAX_TUNNEL_POOL_SIZE), 125 tunnelPoolSize: TUNNEL_POOL_SIZE, 126 tunnels: make([]*Tunnel, 0), 127 establishedOnce: false, 128 isEstablishing: false, 129 130 untunneledSplitTunnelClassifications: lrucache.NewWithLRU( 131 splitTunnelClassificationTTL, 132 1*time.Minute, 133 splitTunnelClassificationMaxEntries), 134 135 // TODO: Add a buffer of 1 so we don't miss a signal while receiver is 136 // starting? Trade-off is potential back-to-back fetch remotes. As-is, 137 // establish will eventually signal another fetch remote. 138 signalFetchCommonRemoteServerList: make(chan struct{}), 139 signalFetchObfuscatedServerLists: make(chan struct{}), 140 signalDownloadUpgrade: make(chan string), 141 signalReportConnected: make(chan struct{}), 142 143 // Using a buffer of 1 to ensure there's no race between the first signal 144 // sent and a channel receiver initializing; a side effect is that this 145 // allows 1 additional scan to enqueue while a scan is in progress, possibly 146 // resulting in one unnecessary scan. 147 signalReportServerEntries: make(chan *serverEntriesReportRequest, 1), 148 149 // signalRestartEstablishing has a buffer of 1 to ensure sending the 150 // signal doesn't block and receiving won't miss a signal. 151 signalRestartEstablishing: make(chan struct{}, 1), 152 } 153 154 // Initialize untunneledDialConfig, used by untunneled dials including 155 // remote server list and upgrade downloads. 156 controller.untunneledDialConfig = &DialConfig{ 157 UpstreamProxyURL: controller.config.UpstreamProxyURL, 158 CustomHeaders: controller.config.CustomHeaders, 159 DeviceBinder: controller.config.deviceBinder, 160 IPv6Synthesizer: controller.config.IPv6Synthesizer, 161 ResolveIP: func(ctx context.Context, hostname string) ([]net.IP, error) { 162 IPs, err := UntunneledResolveIP( 163 ctx, controller.config, controller.resolver, hostname) 164 if err != nil { 165 return nil, errors.Trace(err) 166 } 167 return IPs, nil 168 }, 169 TrustedCACertificatesFilename: controller.config.TrustedCACertificatesFilename, 170 } 171 172 if config.PacketTunnelTunFileDescriptor > 0 { 173 174 // Run a packet tunnel client. The lifetime of the tun.Client is the 175 // lifetime of the Controller, so it exists across tunnel establishments 176 // and reestablishments. The PacketTunnelTransport provides a layer 177 // that presents a continuosuly existing transport to the tun.Client; 178 // it's set to use new SSH channels after new SSH tunnel establishes. 179 180 packetTunnelTransport := NewPacketTunnelTransport() 181 182 packetTunnelClient, err := tun.NewClient(&tun.ClientConfig{ 183 Logger: NoticeCommonLogger(), 184 TunFileDescriptor: config.PacketTunnelTunFileDescriptor, 185 Transport: packetTunnelTransport, 186 }) 187 if err != nil { 188 return nil, errors.Trace(err) 189 } 190 191 controller.packetTunnelClient = packetTunnelClient 192 controller.packetTunnelTransport = packetTunnelTransport 193 } 194 195 return controller, nil 196 } 197 198 // Run executes the controller. Run exits if a controller 199 // component fails or the parent context is canceled. 200 func (controller *Controller) Run(ctx context.Context) { 201 202 if controller.config.LimitCPUThreads { 203 runtime.GOMAXPROCS(1) 204 } 205 206 pprofRun() 207 208 // Ensure fresh repetitive notice state for each run, so the 209 // client will always get an AvailableEgressRegions notice, 210 // an initial instance of any repetitive error notice, etc. 211 ResetRepetitiveNotices() 212 213 runCtx, stopRunning := context.WithCancel(ctx) 214 defer stopRunning() 215 216 controller.runCtx = runCtx 217 controller.stopRunning = stopRunning 218 219 // Start components 220 221 // Initialize a single resolver to be used by all dials. Sharing a single 222 // resolver ensures cached results are shared, and that network state 223 // query overhead is amortized over all dials. Multiple dials can resolve 224 // domain concurrently. 225 // 226 // config.SetResolver makes this resolver available to MakeDialParameters. 227 controller.resolver = NewResolver(controller.config, true) 228 defer controller.resolver.Stop() 229 controller.config.SetResolver(controller.resolver) 230 231 // TODO: IPv6 support 232 var listenIP string 233 if controller.config.ListenInterface == "" { 234 listenIP = "127.0.0.1" 235 } else if controller.config.ListenInterface == "any" { 236 listenIP = "0.0.0.0" 237 } else { 238 IPv4Address, _, err := common.GetInterfaceIPAddresses(controller.config.ListenInterface) 239 if err == nil && IPv4Address == nil { 240 err = fmt.Errorf("no IPv4 address for interface %s", controller.config.ListenInterface) 241 } 242 if err != nil { 243 NoticeError("error getting listener IP: %v", errors.Trace(err)) 244 return 245 } 246 listenIP = IPv4Address.String() 247 } 248 249 if !controller.config.DisableLocalSocksProxy { 250 socksProxy, err := NewSocksProxy(controller.config, controller, listenIP) 251 if err != nil { 252 NoticeError("error initializing local SOCKS proxy: %v", errors.Trace(err)) 253 return 254 } 255 defer socksProxy.Close() 256 } 257 258 if !controller.config.DisableLocalHTTPProxy { 259 httpProxy, err := NewHttpProxy(controller.config, controller, listenIP) 260 if err != nil { 261 NoticeError("error initializing local HTTP proxy: %v", errors.Trace(err)) 262 return 263 } 264 defer httpProxy.Close() 265 } 266 267 if !controller.config.DisableRemoteServerListFetcher { 268 269 if controller.config.RemoteServerListURLs != nil { 270 controller.runWaitGroup.Add(1) 271 go controller.remoteServerListFetcher( 272 "common", 273 FetchCommonRemoteServerList, 274 controller.signalFetchCommonRemoteServerList) 275 } 276 277 if controller.config.ObfuscatedServerListRootURLs != nil { 278 controller.runWaitGroup.Add(1) 279 go controller.remoteServerListFetcher( 280 "obfuscated", 281 FetchObfuscatedServerLists, 282 controller.signalFetchObfuscatedServerLists) 283 } 284 } 285 286 if controller.config.UpgradeDownloadURLs != nil { 287 controller.runWaitGroup.Add(1) 288 go controller.upgradeDownloader() 289 } 290 291 controller.runWaitGroup.Add(1) 292 go controller.serverEntriesReporter() 293 294 controller.runWaitGroup.Add(1) 295 go controller.connectedReporter() 296 297 controller.runWaitGroup.Add(1) 298 go controller.establishTunnelWatcher() 299 300 controller.runWaitGroup.Add(1) 301 go controller.runTunnels() 302 303 if controller.packetTunnelClient != nil { 304 controller.packetTunnelClient.Start() 305 } 306 307 // Wait while running 308 309 <-controller.runCtx.Done() 310 NoticeInfo("controller stopped") 311 312 if controller.packetTunnelClient != nil { 313 controller.packetTunnelClient.Stop() 314 } 315 316 // All workers -- runTunnels, establishment workers, and auxilliary 317 // workers such as fetch remote server list and untunneled uprade 318 // download -- operate with the controller run context and will all 319 // be interrupted when the run context is done. 320 321 controller.runWaitGroup.Wait() 322 323 NoticeInfo("exiting controller") 324 325 NoticeExiting() 326 } 327 328 // SignalComponentFailure notifies the controller that an associated component has failed. 329 // This will terminate the controller. 330 func (controller *Controller) SignalComponentFailure() { 331 NoticeWarning("controller shutdown due to component failure") 332 controller.stopRunning() 333 } 334 335 // SetDynamicConfig overrides the sponsor ID and authorizations fields of the 336 // Controller config with the input values. The new values will be used in the 337 // next tunnel connection. 338 func (controller *Controller) SetDynamicConfig(sponsorID string, authorizations []string) { 339 controller.config.SetDynamicConfig(sponsorID, authorizations) 340 } 341 342 // TerminateNextActiveTunnel terminates the active tunnel, which will initiate 343 // establishment of a new tunnel. 344 func (controller *Controller) TerminateNextActiveTunnel() { 345 tunnel := controller.getNextActiveTunnel() 346 if tunnel != nil { 347 controller.SignalTunnelFailure(tunnel) 348 NoticeInfo("terminated tunnel: %s", tunnel.dialParams.ServerEntry.GetDiagnosticID()) 349 } 350 } 351 352 // ExportExchangePayload creates a payload for client-to-client server 353 // connection info exchange. See the comment for psiphon.ExportExchangePayload 354 // for more details. 355 func (controller *Controller) ExportExchangePayload() string { 356 return ExportExchangePayload(controller.config) 357 } 358 359 // ImportExchangePayload imports a payload generated by ExportExchangePayload. 360 // See the comment for psiphon.ImportExchangePayload for more details about 361 // the import. 362 // 363 // When the import is successful, a signal is set to trigger a restart any 364 // establishment in progress. This will cause the newly imported server entry 365 // to be prioritized, which it otherwise would not be in later establishment 366 // rounds. The establishment process continues after ImportExchangePayload 367 // returns. 368 // 369 // If the client already has a connected tunnel, or a tunnel connection is 370 // established concurrently with the import, the signal has no effect as the 371 // overall goal is establish _any_ connection. 372 func (controller *Controller) ImportExchangePayload(payload string) bool { 373 374 // Race condition: if a new tunnel connection is established concurrently 375 // with the import, either that tunnel's server entry of the imported server 376 // entry may end up as the affinity server. 377 378 ok := ImportExchangePayload(controller.config, payload) 379 if !ok { 380 return false 381 } 382 383 select { 384 case controller.signalRestartEstablishing <- struct{}{}: 385 default: 386 } 387 388 return true 389 } 390 391 // remoteServerListFetcher fetches an out-of-band list of server entries 392 // for more tunnel candidates. It fetches when signalled, with retries 393 // on failure. 394 func (controller *Controller) remoteServerListFetcher( 395 name string, 396 fetcher RemoteServerListFetcher, 397 signal <-chan struct{}) { 398 399 defer controller.runWaitGroup.Done() 400 401 var lastFetchTime time.Time 402 403 fetcherLoop: 404 for { 405 // Wait for a signal before fetching 406 select { 407 case <-signal: 408 case <-controller.runCtx.Done(): 409 break fetcherLoop 410 } 411 412 // Skip fetch entirely (i.e., send no request at all, even when ETag would save 413 // on response size) when a recent fetch was successful 414 415 stalePeriod := controller.config.GetParameters().Get().Duration( 416 parameters.FetchRemoteServerListStalePeriod) 417 418 if !lastFetchTime.IsZero() && 419 lastFetchTime.Add(stalePeriod).After(time.Now()) { 420 continue 421 } 422 423 retryLoop: 424 for attempt := 0; ; attempt++ { 425 // Don't attempt to fetch while there is no network connectivity, 426 // to avoid alert notice noise. 427 if !WaitForNetworkConnectivity( 428 controller.runCtx, 429 controller.config.NetworkConnectivityChecker) { 430 break fetcherLoop 431 } 432 433 // Pick any active tunnel and make the next fetch attempt. If there's 434 // no active tunnel, the untunneledDialConfig will be used. 435 tunnel := controller.getNextActiveTunnel() 436 437 err := fetcher( 438 controller.runCtx, 439 controller.config, 440 attempt, 441 tunnel, 442 controller.untunneledDialConfig) 443 444 if err == nil { 445 lastFetchTime = time.Now() 446 break retryLoop 447 } 448 449 NoticeWarning("failed to fetch %s remote server list: %v", 450 name, errors.Trace(err)) 451 452 retryPeriod := controller.config.GetParameters().Get().Duration( 453 parameters.FetchRemoteServerListRetryPeriod) 454 455 timer := time.NewTimer(retryPeriod) 456 select { 457 case <-timer.C: 458 case <-controller.runCtx.Done(): 459 timer.Stop() 460 break fetcherLoop 461 } 462 } 463 } 464 465 NoticeInfo("exiting %s remote server list fetcher", name) 466 } 467 468 // upgradeDownloader makes periodic attempts to complete a client upgrade 469 // download. DownloadUpgrade() is resumable, so each attempt has potential for 470 // getting closer to completion, even in conditions where the download or 471 // tunnel is repeatedly interrupted. 472 // An upgrade download is triggered by either a handshake response indicating 473 // that a new version is available; or after failing to connect, in which case 474 // it's useful to check, out-of-band, for an upgrade with new circumvention 475 // capabilities. 476 // Once the download operation completes successfully, the downloader exits 477 // and is not run again: either there is not a newer version, or the upgrade 478 // has been downloaded and is ready to be applied. 479 // We're assuming that the upgrade will be applied and the entire system 480 // restarted before another upgrade is to be downloaded. 481 // 482 // TODO: refactor upgrade downloader and remote server list fetcher to use 483 // common code (including the resumable download routines). 484 // 485 func (controller *Controller) upgradeDownloader() { 486 defer controller.runWaitGroup.Done() 487 488 var lastDownloadTime time.Time 489 490 downloadLoop: 491 for { 492 // Wait for a signal before downloading 493 var handshakeVersion string 494 select { 495 case handshakeVersion = <-controller.signalDownloadUpgrade: 496 case <-controller.runCtx.Done(): 497 break downloadLoop 498 } 499 500 stalePeriod := controller.config.GetParameters().Get().Duration( 501 parameters.FetchUpgradeStalePeriod) 502 503 // Unless handshake is explicitly advertizing a new version, skip 504 // checking entirely when a recent download was successful. 505 if handshakeVersion == "" && 506 !lastDownloadTime.IsZero() && 507 lastDownloadTime.Add(stalePeriod).After(time.Now()) { 508 continue 509 } 510 511 retryLoop: 512 for attempt := 0; ; attempt++ { 513 // Don't attempt to download while there is no network connectivity, 514 // to avoid alert notice noise. 515 if !WaitForNetworkConnectivity( 516 controller.runCtx, 517 controller.config.NetworkConnectivityChecker) { 518 break downloadLoop 519 } 520 521 // Pick any active tunnel and make the next download attempt. If there's 522 // no active tunnel, the untunneledDialConfig will be used. 523 tunnel := controller.getNextActiveTunnel() 524 525 err := DownloadUpgrade( 526 controller.runCtx, 527 controller.config, 528 attempt, 529 handshakeVersion, 530 tunnel, 531 controller.untunneledDialConfig) 532 533 if err == nil { 534 lastDownloadTime = time.Now() 535 break retryLoop 536 } 537 538 NoticeWarning("failed to download upgrade: %v", errors.Trace(err)) 539 540 timeout := controller.config.GetParameters().Get().Duration( 541 parameters.FetchUpgradeRetryPeriod) 542 543 timer := time.NewTimer(timeout) 544 select { 545 case <-timer.C: 546 case <-controller.runCtx.Done(): 547 timer.Stop() 548 break downloadLoop 549 } 550 } 551 } 552 553 NoticeInfo("exiting upgrade downloader") 554 } 555 556 type serverEntriesReportRequest struct { 557 constraints *protocolSelectionConstraints 558 awaitResponse chan *serverEntriesReportResponse 559 } 560 561 type serverEntriesReportResponse struct { 562 err error 563 candidates int 564 initialCandidates int 565 initialCandidatesAnyEgressRegion int 566 availableEgressRegions []string 567 } 568 569 // serverEntriesReporter performs scans over all server entries to report on 570 // available tunnel candidates, subject to protocol selection constraints, and 571 // available egress regions. 572 // 573 // Because scans may be slow, depending on the client device and server entry 574 // list size, serverEntriesReporter is used to perform asychronous, background 575 // operations that would otherwise block establishment. This includes emitting 576 // diagnotic notices that are informational (CandidateServers) or which do not 577 // need to emit before establishment starts (AvailableEgressRegions). 578 // 579 // serverEntriesReporter also serves to combine these scans, which would 580 // otherwise be logically independent, due to the performance impact of scans. 581 // 582 // The underlying datastore implementation _may_ block write transactions 583 // while there are open read transactions. For example, bolt write 584 // transactions which need to re-map the data file (when the datastore grows) 585 // will block on open read transactions. In these scenarios, a slow scan will 586 // still block other operations. 587 // 588 // serverEntriesReporter runs beyond the establishment phase, since it's 589 // important for notices such as AvailableEgressRegions to eventually emit 590 // even if already established. serverEntriesReporter scans are cancellable, 591 // so controller shutdown is not blocked by slow scans. 592 // 593 // In some special cases, establishment cannot begin without candidate counts 594 // up front. In these cases only, the request contains a non-nil 595 // awaitResponse, a channel which is used by the requester to block until the 596 // scan is complete and the candidate counts are available. 597 func (controller *Controller) serverEntriesReporter() { 598 defer controller.runWaitGroup.Done() 599 600 loop: 601 for { 602 603 var request *serverEntriesReportRequest 604 605 select { 606 case request = <-controller.signalReportServerEntries: 607 case <-controller.runCtx.Done(): 608 break loop 609 } 610 611 egressRegion := controller.config.EgressRegion 612 constraints := request.constraints 613 614 var response serverEntriesReportResponse 615 616 regions := make(map[string]bool) 617 618 callback := func(serverEntry *protocol.ServerEntry) bool { 619 620 // In establishment, excludeIntensive depends on what set of protocols are 621 // already being dialed. For these reports, don't exclude intensive 622 // protocols as any intensive candidate can always be an available 623 // candidate at some point. 624 excludeIntensive := false 625 626 isInitialCandidate := constraints.isInitialCandidate(excludeIntensive, serverEntry) 627 isCandidate := constraints.isCandidate(excludeIntensive, serverEntry) 628 629 if isInitialCandidate { 630 response.initialCandidatesAnyEgressRegion += 1 631 } 632 633 if egressRegion == "" || serverEntry.Region == egressRegion { 634 if isInitialCandidate { 635 response.initialCandidates += 1 636 } 637 if isCandidate { 638 response.candidates += 1 639 } 640 } 641 642 isAvailable := isCandidate 643 if constraints.hasInitialProtocols() { 644 // Available egress regions is subject to an initial limit constraint, if 645 // present: see AvailableEgressRegions comment in launchEstablishing. 646 isAvailable = isInitialCandidate 647 } 648 649 if isAvailable { 650 // Ignore server entries with no region field. 651 if serverEntry.Region != "" { 652 regions[serverEntry.Region] = true 653 } 654 } 655 656 select { 657 case <-controller.runCtx.Done(): 658 // Don't block controller shutdown: cancel the scan. 659 return false 660 default: 661 return true 662 } 663 } 664 665 startTime := time.Now() 666 667 response.err = ScanServerEntries(callback) 668 669 // Report this duration in CandidateServers as an indication of datastore 670 // performance. 671 duration := time.Since(startTime) 672 673 response.availableEgressRegions = make([]string, 0, len(regions)) 674 for region := range regions { 675 response.availableEgressRegions = append(response.availableEgressRegions, region) 676 } 677 678 if response.err != nil { 679 680 // For diagnostics, we'll post this even when cancelled due to shutdown. 681 NoticeWarning("ScanServerEntries failed: %v", errors.Trace(response.err)) 682 683 // Continue and send error reponse. Clear any partial data to avoid 684 // misuse. 685 response.candidates = 0 686 response.initialCandidates = 0 687 response.initialCandidatesAnyEgressRegion = 0 688 response.availableEgressRegions = []string{} 689 } 690 691 if request.awaitResponse != nil { 692 select { 693 case request.awaitResponse <- &response: 694 case <-controller.runCtx.Done(): 695 // The receiver may be gone when shutting down. 696 } 697 } 698 699 if response.err == nil { 700 701 NoticeCandidateServers( 702 controller.config.EgressRegion, 703 controller.protocolSelectionConstraints, 704 response.initialCandidates, 705 response.candidates, 706 duration) 707 708 NoticeAvailableEgressRegions( 709 response.availableEgressRegions) 710 } 711 } 712 713 NoticeInfo("exiting server entries reporter") 714 } 715 716 // signalServerEntriesReporter triggers a new server entry report. Set 717 // request.awaitResponse to obtain the report output. When awaitResponse is 718 // set, signalServerEntriesReporter blocks until the reporter receives the 719 // request, guaranteeing the new report runs. Otherwise, the report is 720 // considered to be informational and may or may not run, depending on whether 721 // another run is already in progress. 722 func (controller *Controller) signalServerEntriesReporter(request *serverEntriesReportRequest) { 723 724 if request.awaitResponse == nil { 725 select { 726 case controller.signalReportServerEntries <- request: 727 default: 728 } 729 } else { 730 controller.signalReportServerEntries <- request 731 } 732 } 733 734 // connectedReporter sends periodic "connected" requests to the Psiphon API. 735 // These requests are for server-side unique user stats calculation. See the 736 // comment in DoConnectedRequest for a description of the request mechanism. 737 // 738 // To correctly count daily unique users, only one connected request is made 739 // across all simultaneous multi-tunnels; and the connected request is 740 // repeated every 24h. 741 // 742 // The signalReportConnected mechanism is used to trigger a connected request 743 // immediately after a reconnect. While strictly only one connected request 744 // per 24h is required in order to count daily unique users, the connected 745 // request also delivers the establishment duration metric (which includes 746 // time elapsed performing the handshake request) and additional fragmentation 747 // metrics; these metrics are measured for each tunnel. 748 func (controller *Controller) connectedReporter() { 749 defer controller.runWaitGroup.Done() 750 751 // session is nil when DisableApi is set 752 if controller.config.DisableApi { 753 return 754 } 755 756 select { 757 case <-controller.signalReportConnected: 758 // Make the initial connected request 759 case <-controller.runCtx.Done(): 760 return 761 } 762 763 loop: 764 for { 765 766 // Pick any active tunnel and make the next connected request. No error is 767 // logged if there's no active tunnel, as that's not an unexpected 768 // condition. 769 reported := false 770 tunnel := controller.getNextActiveTunnel() 771 if tunnel != nil { 772 err := tunnel.serverContext.DoConnectedRequest() 773 if err == nil { 774 reported = true 775 } else { 776 NoticeWarning("failed to make connected request: %v", 777 errors.Trace(err)) 778 } 779 } 780 781 // Schedule the next connected request and wait. This duration is not a 782 // dynamic ClientParameter as the daily unique user stats logic specifically 783 // requires a "connected" request no more or less often than every 24h. 784 var duration time.Duration 785 if reported { 786 duration = 24 * time.Hour 787 } else { 788 duration = controller.config.GetParameters().Get().Duration( 789 parameters.PsiphonAPIConnectedRequestRetryPeriod) 790 } 791 timer := time.NewTimer(duration) 792 doBreak := false 793 select { 794 case <-controller.signalReportConnected: 795 case <-timer.C: 796 // Make another connected request 797 case <-controller.runCtx.Done(): 798 doBreak = true 799 } 800 timer.Stop() 801 if doBreak { 802 break loop 803 } 804 } 805 806 NoticeInfo("exiting connected reporter") 807 } 808 809 func (controller *Controller) signalConnectedReporter() { 810 811 // session is nil when DisableApi is set 812 if controller.config.DisableApi { 813 return 814 } 815 816 select { 817 case controller.signalReportConnected <- struct{}{}: 818 default: 819 } 820 } 821 822 // establishTunnelWatcher terminates the controller if a tunnel 823 // has not been established in the configured time period. This 824 // is regardless of how many tunnels are presently active -- meaning 825 // that if an active tunnel was established and lost the controller 826 // is left running (to re-establish). 827 func (controller *Controller) establishTunnelWatcher() { 828 defer controller.runWaitGroup.Done() 829 830 timeout := controller.config.GetParameters().Get().Duration( 831 parameters.EstablishTunnelTimeout) 832 833 if timeout > 0 { 834 timer := time.NewTimer(timeout) 835 defer timer.Stop() 836 837 select { 838 case <-timer.C: 839 if !controller.hasEstablishedOnce() { 840 NoticeEstablishTunnelTimeout(timeout) 841 controller.SignalComponentFailure() 842 } 843 case <-controller.runCtx.Done(): 844 } 845 } 846 847 NoticeInfo("exiting establish tunnel watcher") 848 } 849 850 // runTunnels is the controller tunnel management main loop. It starts and stops 851 // establishing tunnels based on the target tunnel pool size and the current size 852 // of the pool. Tunnels are established asynchronously using worker goroutines. 853 // 854 // When there are no server entries for the target region/protocol, the 855 // establishCandidateGenerator will yield no candidates and wait before 856 // trying again. In the meantime, a remote server entry fetch may supply 857 // valid candidates. 858 // 859 // When a tunnel is established, it's added to the active pool. The tunnel's 860 // operateTunnel goroutine monitors the tunnel. 861 // 862 // When a tunnel fails, it's removed from the pool and the establish process is 863 // restarted to fill the pool. 864 func (controller *Controller) runTunnels() { 865 defer controller.runWaitGroup.Done() 866 867 // Start running 868 869 controller.startEstablishing() 870 loop: 871 for { 872 select { 873 874 case <-controller.signalRestartEstablishing: 875 876 // signalRestartEstablishing restarts any establishment in progress. One 877 // use case for this is to prioritize a newly imported, exchanged server 878 // entry, which will be in the affinity position. 879 // 880 // It's possible for another connection to establish concurrent to signalling; 881 // since the overall goal remains to establish _any_ connection, we accept that 882 // in some cases the exchanged server entry may not get used. 883 884 if controller.isEstablishing { 885 controller.stopEstablishing() 886 controller.startEstablishing() 887 } 888 889 case failedTunnel := <-controller.failedTunnels: 890 NoticeWarning("tunnel failed: %s", failedTunnel.dialParams.ServerEntry.GetDiagnosticID()) 891 controller.terminateTunnel(failedTunnel) 892 893 // Clear the reference to this tunnel before calling startEstablishing, 894 // which will invoke a garbage collection. 895 failedTunnel = nil 896 897 // Concurrency note: only this goroutine may call startEstablishing/stopEstablishing, 898 // which reference controller.isEstablishing. 899 controller.startEstablishing() 900 901 case connectedTunnel := <-controller.connectedTunnels: 902 903 // Tunnel establishment has two phases: connection and activation. 904 // 905 // Connection is run concurrently by the establishTunnelWorkers, to minimize 906 // delay when it's not yet known which server and protocol will be available 907 // and unblocked. 908 // 909 // Activation is run serially, here, to minimize the overhead of making a 910 // handshake request and starting the operateTunnel management worker for a 911 // tunnel which may be discarded. 912 // 913 // When the active tunnel will complete establishment, establishment is 914 // stopped before activation. This interrupts all connecting tunnels and 915 // garbage collects their memory. The purpose is to minimize memory 916 // pressure when the handshake request is made. In the unlikely case that the 917 // handshake fails, establishment is restarted. 918 // 919 // Any delays in stopEstablishing will delay the handshake for the last 920 // active tunnel. 921 // 922 // In the typical case of tunnelPoolSize of 1, only a single handshake is 923 // performed and the homepages notices file, when used, will not be modifed 924 // after the NoticeTunnels(1) [i.e., connected] until NoticeTunnels(0) [i.e., 925 // disconnected]. For tunnelPoolSize > 1, serial handshakes only ensures that 926 // each set of emitted NoticeHomepages is contiguous. 927 928 active, outstanding := controller.numTunnels() 929 930 // discardTunnel will be true here when already fully established. 931 932 discardTunnel := (outstanding <= 0) 933 isFirstTunnel := (active == 0) 934 isLastTunnel := (outstanding == 1) 935 936 if !discardTunnel { 937 938 if isLastTunnel { 939 controller.stopEstablishing() 940 } 941 942 err := connectedTunnel.Activate(controller.runCtx, controller) 943 944 if err != nil { 945 NoticeWarning("failed to activate %s: %v", 946 connectedTunnel.dialParams.ServerEntry.GetDiagnosticID(), 947 errors.Trace(err)) 948 discardTunnel = true 949 } else { 950 // It's unlikely that registerTunnel will fail, since only this goroutine 951 // calls registerTunnel -- and after checking numTunnels; so failure is not 952 // expected. 953 if !controller.registerTunnel(connectedTunnel) { 954 NoticeWarning("failed to register %s: %v", 955 connectedTunnel.dialParams.ServerEntry.GetDiagnosticID(), 956 errors.Trace(err)) 957 discardTunnel = true 958 } 959 } 960 961 // May need to replace this tunnel 962 if isLastTunnel && discardTunnel { 963 controller.startEstablishing() 964 } 965 966 } 967 968 if discardTunnel { 969 controller.discardTunnel(connectedTunnel) 970 971 // Clear the reference to this discarded tunnel and immediately run 972 // a garbage collection to reclaim its memory. 973 connectedTunnel = nil 974 DoGarbageCollection() 975 976 // Skip the rest of this case 977 break 978 } 979 980 atomic.AddInt32(&controller.establishedTunnelsCount, 1) 981 982 NoticeActiveTunnel( 983 connectedTunnel.dialParams.ServerEntry.GetDiagnosticID(), 984 connectedTunnel.dialParams.TunnelProtocol, 985 connectedTunnel.dialParams.ServerEntry.SupportsSSHAPIRequests()) 986 987 if isFirstTunnel { 988 989 // Signal a connected request on each 1st tunnel establishment. For 990 // multi-tunnels, the session is connected as long as at least one 991 // tunnel is established. 992 controller.signalConnectedReporter() 993 994 // If the handshake indicated that a new client version is available, 995 // trigger an upgrade download. 996 // Note: serverContext is nil when DisableApi is set 997 if connectedTunnel.serverContext != nil && 998 connectedTunnel.serverContext.clientUpgradeVersion != "" { 999 1000 handshakeVersion := connectedTunnel.serverContext.clientUpgradeVersion 1001 select { 1002 case controller.signalDownloadUpgrade <- handshakeVersion: 1003 default: 1004 } 1005 } 1006 } 1007 1008 // Set the new tunnel as the transport for the packet tunnel. The packet tunnel 1009 // client remains up when reestablishing, but no packets are relayed while there 1010 // is no connected tunnel. UseTunnel will establish a new packet tunnel SSH 1011 // channel over the new SSH tunnel and configure the packet tunnel client to use 1012 // the new SSH channel as its transport. 1013 // 1014 // Note: as is, this logic is suboptimal for tunnelPoolSize > 1, as this would 1015 // continuously initialize new packet tunnel sessions for each established 1016 // server. For now, config validation requires tunnelPoolSize == 1 when 1017 // the packet tunnel is used. 1018 1019 if controller.packetTunnelTransport != nil { 1020 controller.packetTunnelTransport.UseTunnel(connectedTunnel) 1021 } 1022 1023 if controller.isFullyEstablished() { 1024 controller.stopEstablishing() 1025 } 1026 1027 case <-controller.runCtx.Done(): 1028 break loop 1029 } 1030 } 1031 1032 // Stop running 1033 1034 controller.stopEstablishing() 1035 controller.terminateAllTunnels() 1036 1037 // Drain tunnel channels 1038 close(controller.connectedTunnels) 1039 for tunnel := range controller.connectedTunnels { 1040 controller.discardTunnel(tunnel) 1041 } 1042 close(controller.failedTunnels) 1043 for tunnel := range controller.failedTunnels { 1044 controller.discardTunnel(tunnel) 1045 } 1046 1047 NoticeInfo("exiting run tunnels") 1048 } 1049 1050 // SignalSeededNewSLOK implements the TunnelOwner interface. This function 1051 // is called by Tunnel.operateTunnel when the tunnel has received a new, 1052 // previously unknown SLOK from the server. The Controller triggers an OSL 1053 // fetch, as the new SLOK may be sufficient to access new OSLs. 1054 func (controller *Controller) SignalSeededNewSLOK() { 1055 select { 1056 case controller.signalFetchObfuscatedServerLists <- struct{}{}: 1057 default: 1058 } 1059 } 1060 1061 // SignalTunnelFailure implements the TunnelOwner interface. This function 1062 // is called by Tunnel.operateTunnel when the tunnel has detected that it 1063 // has failed. The Controller will signal runTunnels to create a new 1064 // tunnel and/or remove the tunnel from the list of active tunnels. 1065 func (controller *Controller) SignalTunnelFailure(tunnel *Tunnel) { 1066 // Don't block. Assumes the receiver has a buffer large enough for 1067 // the typical number of operated tunnels. In case there's no room, 1068 // terminate the tunnel (runTunnels won't get a signal in this case, 1069 // but the tunnel will be removed from the list of active tunnels). 1070 select { 1071 case controller.failedTunnels <- tunnel: 1072 default: 1073 controller.terminateTunnel(tunnel) 1074 } 1075 } 1076 1077 // discardTunnel disposes of a successful connection that is no longer required. 1078 func (controller *Controller) discardTunnel(tunnel *Tunnel) { 1079 NoticeInfo("discard tunnel: %s", tunnel.dialParams.ServerEntry.GetDiagnosticID()) 1080 // TODO: not calling PromoteServerEntry, since that would rank the 1081 // discarded tunnel before fully active tunnels. Can a discarded tunnel 1082 // be promoted (since it connects), but with lower rank than all active 1083 // tunnels? 1084 tunnel.Close(true) 1085 } 1086 1087 // registerTunnel adds the connected tunnel to the pool of active tunnels 1088 // which are candidates for port forwarding. Returns true if the pool has an 1089 // empty slot and false if the pool is full (caller should discard the tunnel). 1090 func (controller *Controller) registerTunnel(tunnel *Tunnel) bool { 1091 controller.tunnelMutex.Lock() 1092 defer controller.tunnelMutex.Unlock() 1093 if len(controller.tunnels) >= controller.tunnelPoolSize { 1094 return false 1095 } 1096 // Perform a final check just in case we've established 1097 // a duplicate connection. 1098 for _, activeTunnel := range controller.tunnels { 1099 if activeTunnel.dialParams.ServerEntry.IpAddress == 1100 tunnel.dialParams.ServerEntry.IpAddress { 1101 1102 NoticeWarning("duplicate tunnel: %s", tunnel.dialParams.ServerEntry.GetDiagnosticID()) 1103 return false 1104 } 1105 } 1106 controller.establishedOnce = true 1107 controller.tunnels = append(controller.tunnels, tunnel) 1108 NoticeTunnels(len(controller.tunnels)) 1109 1110 // Promote this successful tunnel to first rank so it's one 1111 // of the first candidates next time establish runs. 1112 // Connecting to a TargetServerEntry does not change the 1113 // ranking. 1114 if controller.config.TargetServerEntry == "" { 1115 PromoteServerEntry(controller.config, tunnel.dialParams.ServerEntry.IpAddress) 1116 } 1117 1118 return true 1119 } 1120 1121 // hasEstablishedOnce indicates if at least one active tunnel has 1122 // been established up to this point. This is regardeless of how many 1123 // tunnels are presently active. 1124 func (controller *Controller) hasEstablishedOnce() bool { 1125 controller.tunnelMutex.Lock() 1126 defer controller.tunnelMutex.Unlock() 1127 return controller.establishedOnce 1128 } 1129 1130 // isFullyEstablished indicates if the pool of active tunnels is full. 1131 func (controller *Controller) isFullyEstablished() bool { 1132 controller.tunnelMutex.Lock() 1133 defer controller.tunnelMutex.Unlock() 1134 return len(controller.tunnels) >= controller.tunnelPoolSize 1135 } 1136 1137 // numTunnels returns the number of active and outstanding tunnels. 1138 // Oustanding is the number of tunnels required to fill the pool of 1139 // active tunnels. 1140 func (controller *Controller) numTunnels() (int, int) { 1141 controller.tunnelMutex.Lock() 1142 defer controller.tunnelMutex.Unlock() 1143 active := len(controller.tunnels) 1144 outstanding := controller.tunnelPoolSize - len(controller.tunnels) 1145 return active, outstanding 1146 } 1147 1148 // terminateTunnel removes a tunnel from the pool of active tunnels 1149 // and closes the tunnel. The next-tunnel state used by getNextActiveTunnel 1150 // is adjusted as required. 1151 func (controller *Controller) terminateTunnel(tunnel *Tunnel) { 1152 controller.tunnelMutex.Lock() 1153 defer controller.tunnelMutex.Unlock() 1154 for index, activeTunnel := range controller.tunnels { 1155 if tunnel == activeTunnel { 1156 controller.tunnels = append( 1157 controller.tunnels[:index], controller.tunnels[index+1:]...) 1158 if controller.nextTunnel > index { 1159 controller.nextTunnel-- 1160 } 1161 if controller.nextTunnel >= len(controller.tunnels) { 1162 controller.nextTunnel = 0 1163 } 1164 activeTunnel.Close(false) 1165 NoticeTunnels(len(controller.tunnels)) 1166 break 1167 } 1168 } 1169 } 1170 1171 // terminateAllTunnels empties the tunnel pool, closing all active tunnels. 1172 // This is used when shutting down the controller. 1173 func (controller *Controller) terminateAllTunnels() { 1174 controller.tunnelMutex.Lock() 1175 defer controller.tunnelMutex.Unlock() 1176 // Closing all tunnels in parallel. In an orderly shutdown, each tunnel 1177 // may take a few seconds to send a final status request. We only want 1178 // to wait as long as the single slowest tunnel. 1179 closeWaitGroup := new(sync.WaitGroup) 1180 closeWaitGroup.Add(len(controller.tunnels)) 1181 for _, activeTunnel := range controller.tunnels { 1182 tunnel := activeTunnel 1183 go func() { 1184 defer closeWaitGroup.Done() 1185 tunnel.Close(false) 1186 }() 1187 } 1188 closeWaitGroup.Wait() 1189 controller.tunnels = make([]*Tunnel, 0) 1190 controller.nextTunnel = 0 1191 NoticeTunnels(len(controller.tunnels)) 1192 } 1193 1194 // getNextActiveTunnel returns the next tunnel from the pool of active 1195 // tunnels. Currently, tunnel selection order is simple round-robin. 1196 func (controller *Controller) getNextActiveTunnel() (tunnel *Tunnel) { 1197 controller.tunnelMutex.Lock() 1198 defer controller.tunnelMutex.Unlock() 1199 if len(controller.tunnels) == 0 { 1200 return nil 1201 } 1202 tunnel = controller.tunnels[controller.nextTunnel] 1203 controller.nextTunnel = 1204 (controller.nextTunnel + 1) % len(controller.tunnels) 1205 return tunnel 1206 } 1207 1208 // isActiveTunnelServerEntry is used to check if there's already 1209 // an existing tunnel to a candidate server. 1210 func (controller *Controller) isActiveTunnelServerEntry( 1211 serverEntry *protocol.ServerEntry) bool { 1212 1213 controller.tunnelMutex.Lock() 1214 defer controller.tunnelMutex.Unlock() 1215 for _, activeTunnel := range controller.tunnels { 1216 if activeTunnel.dialParams.ServerEntry.IpAddress == serverEntry.IpAddress { 1217 return true 1218 } 1219 } 1220 return false 1221 } 1222 1223 func (controller *Controller) setTunnelPoolSize(tunnelPoolSize int) { 1224 controller.tunnelMutex.Lock() 1225 defer controller.tunnelMutex.Unlock() 1226 if tunnelPoolSize < 1 { 1227 tunnelPoolSize = 1 1228 } 1229 if tunnelPoolSize > MAX_TUNNEL_POOL_SIZE { 1230 tunnelPoolSize = MAX_TUNNEL_POOL_SIZE 1231 } 1232 controller.tunnelPoolSize = tunnelPoolSize 1233 } 1234 1235 func (controller *Controller) getTunnelPoolSize() int { 1236 controller.tunnelMutex.Lock() 1237 defer controller.tunnelMutex.Unlock() 1238 return controller.tunnelPoolSize 1239 } 1240 1241 // Dial selects an active tunnel and establishes a port forward 1242 // connection through the selected tunnel. Failure to connect is considered 1243 // a port forward failure, for the purpose of monitoring tunnel health. 1244 // 1245 // When split tunnel mode is enabled, the connection may be untunneled, 1246 // depending on GeoIP classification of the destination. 1247 // 1248 // downstreamConn is an optional parameter which specifies a connection to be 1249 // explicitly closed when the dialed connection is closed. For instance, this 1250 // is used to close downstreamConn App<->LocalProxy connections when the 1251 // related LocalProxy<->SshPortForward connections close. 1252 func (controller *Controller) Dial( 1253 remoteAddr string, downstreamConn net.Conn) (conn net.Conn, err error) { 1254 1255 tunnel := controller.getNextActiveTunnel() 1256 if tunnel == nil { 1257 return nil, errors.TraceNew("no active tunnels") 1258 } 1259 1260 if !tunnel.config.IsSplitTunnelEnabled() { 1261 1262 tunneledConn, splitTunnel, err := tunnel.DialTCPChannel( 1263 remoteAddr, false, downstreamConn) 1264 if err != nil { 1265 return nil, errors.Trace(err) 1266 } 1267 1268 if splitTunnel { 1269 return nil, errors.TraceNew( 1270 "unexpected split tunnel classification") 1271 } 1272 1273 return tunneledConn, nil 1274 } 1275 1276 // In split tunnel mode, TCP port forwards to destinations in the same 1277 // country as the client are untunneled. 1278 // 1279 // Split tunnel is implemented with assistence from the server to classify 1280 // destinations as being in the same country as the client. The server knows 1281 // the client's public IP GeoIP data, and, for clients with split tunnel mode 1282 // enabled, the server resolves the port forward destination address and 1283 // checks the destination IP GeoIP data. 1284 // 1285 // When the countries match, the server "rejects" the port forward with a 1286 // distinct response that indicates to the client that an untunneled port 1287 // foward should be established locally. 1288 // 1289 // The client maintains a classification cache that allows it to make 1290 // untunneled port forwards without requiring a round trip to the server. 1291 // Only destinations classified as untunneled are stored in the cache: a 1292 // destination classified as tunneled requires the same round trip as an 1293 // unknown destination. 1294 // 1295 // When the countries do not match, the server establishes a port forward, as 1296 // it does for all port forwards in non-split tunnel mode. There is no 1297 // additional round trip for tunneled port forwards. 1298 1299 splitTunnelHost, _, err := net.SplitHostPort(remoteAddr) 1300 if err != nil { 1301 return nil, errors.Trace(err) 1302 } 1303 1304 untunneledCache := controller.untunneledSplitTunnelClassifications 1305 1306 // If the destination hostname is in the untunneled split tunnel 1307 // classifications cache, skip the round trip to the server and do the 1308 // direct, untunneled dial immediately. 1309 _, cachedUntunneled := untunneledCache.Get(splitTunnelHost) 1310 1311 if !cachedUntunneled { 1312 1313 tunneledConn, splitTunnel, err := tunnel.DialTCPChannel( 1314 remoteAddr, false, downstreamConn) 1315 if err != nil { 1316 return nil, errors.Trace(err) 1317 } 1318 1319 if !splitTunnel { 1320 1321 // Clear any cached untunneled classification entry for this destination 1322 // hostname, as the server is now classifying it as tunneled. 1323 untunneledCache.Delete(splitTunnelHost) 1324 1325 return tunneledConn, nil 1326 } 1327 1328 // The server has indicated that the client should make a direct, 1329 // untunneled dial. Cache the classification to avoid this round trip in 1330 // the immediate future. 1331 untunneledCache.Add(splitTunnelHost, true, lrucache.DefaultExpiration) 1332 } 1333 1334 NoticeUntunneled(splitTunnelHost) 1335 1336 untunneledConn, err := controller.DirectDial(remoteAddr) 1337 if err != nil { 1338 return nil, errors.Trace(err) 1339 } 1340 1341 return untunneledConn, nil 1342 } 1343 1344 // DirectDial dials an untunneled TCP connection within the controller run context. 1345 func (controller *Controller) DirectDial(remoteAddr string) (conn net.Conn, err error) { 1346 return DialTCP(controller.runCtx, remoteAddr, controller.untunneledDialConfig) 1347 } 1348 1349 // triggerFetches signals RSL, OSL, and upgrade download fetchers to begin, if 1350 // not already running. triggerFetches is called when tunnel establishment 1351 // fails to complete within a deadline and in other cases where local 1352 // circumvention capabilities are lacking and we may require new server 1353 // entries or client versions with new capabilities. 1354 func (controller *Controller) triggerFetches() { 1355 1356 // Trigger a common remote server list fetch, since we may have failed 1357 // to connect with all known servers. Don't block sending signal, since 1358 // this signal may have already been sent. 1359 // Don't wait for fetch remote to succeed, since it may fail and 1360 // enter a retry loop and we're better off trying more known servers. 1361 // TODO: synchronize the fetch response, so it can be incorporated 1362 // into the server entry iterator as soon as available. 1363 select { 1364 case controller.signalFetchCommonRemoteServerList <- struct{}{}: 1365 default: 1366 } 1367 1368 // Trigger an OSL fetch in parallel. Both fetches are run in parallel 1369 // so that if one out of the common RLS and OSL set is large, it doesn't 1370 // doesn't entirely block fetching the other. 1371 select { 1372 case controller.signalFetchObfuscatedServerLists <- struct{}{}: 1373 default: 1374 } 1375 1376 // Trigger an out-of-band upgrade availability check and download. 1377 // Since we may have failed to connect, we may benefit from upgrading 1378 // to a new client version with new circumvention capabilities. 1379 select { 1380 case controller.signalDownloadUpgrade <- "": 1381 default: 1382 } 1383 } 1384 1385 type protocolSelectionConstraints struct { 1386 useUpstreamProxy bool 1387 initialLimitTunnelProtocols protocol.TunnelProtocols 1388 initialLimitTunnelProtocolsCandidateCount int 1389 limitTunnelProtocols protocol.TunnelProtocols 1390 limitTunnelDialPortNumbers protocol.TunnelProtocolPortLists 1391 limitQUICVersions protocol.QUICVersions 1392 replayCandidateCount int 1393 } 1394 1395 func (p *protocolSelectionConstraints) hasInitialProtocols() bool { 1396 return len(p.initialLimitTunnelProtocols) > 0 && p.initialLimitTunnelProtocolsCandidateCount > 0 1397 } 1398 1399 func (p *protocolSelectionConstraints) isInitialCandidate( 1400 excludeIntensive bool, 1401 serverEntry *protocol.ServerEntry) bool { 1402 1403 return p.hasInitialProtocols() && 1404 len(serverEntry.GetSupportedProtocols( 1405 conditionallyEnabledComponents{}, 1406 p.useUpstreamProxy, 1407 p.initialLimitTunnelProtocols, 1408 p.limitTunnelDialPortNumbers, 1409 p.limitQUICVersions, 1410 excludeIntensive)) > 0 1411 } 1412 1413 func (p *protocolSelectionConstraints) isCandidate( 1414 excludeIntensive bool, 1415 serverEntry *protocol.ServerEntry) bool { 1416 1417 return len(serverEntry.GetSupportedProtocols( 1418 conditionallyEnabledComponents{}, 1419 p.useUpstreamProxy, 1420 p.limitTunnelProtocols, 1421 p.limitTunnelDialPortNumbers, 1422 p.limitQUICVersions, 1423 excludeIntensive)) > 0 1424 } 1425 1426 func (p *protocolSelectionConstraints) canReplay( 1427 connectTunnelCount int, 1428 excludeIntensive bool, 1429 serverEntry *protocol.ServerEntry, 1430 replayProtocol string) bool { 1431 1432 if p.replayCandidateCount != -1 && connectTunnelCount > p.replayCandidateCount { 1433 return false 1434 } 1435 1436 return common.Contains( 1437 p.supportedProtocols(connectTunnelCount, excludeIntensive, serverEntry), 1438 replayProtocol) 1439 } 1440 1441 func (p *protocolSelectionConstraints) supportedProtocols( 1442 connectTunnelCount int, 1443 excludeIntensive bool, 1444 serverEntry *protocol.ServerEntry) []string { 1445 1446 limitTunnelProtocols := p.limitTunnelProtocols 1447 1448 if len(p.initialLimitTunnelProtocols) > 0 && 1449 p.initialLimitTunnelProtocolsCandidateCount > connectTunnelCount { 1450 1451 limitTunnelProtocols = p.initialLimitTunnelProtocols 1452 } 1453 1454 return serverEntry.GetSupportedProtocols( 1455 conditionallyEnabledComponents{}, 1456 p.useUpstreamProxy, 1457 limitTunnelProtocols, 1458 p.limitTunnelDialPortNumbers, 1459 p.limitQUICVersions, 1460 excludeIntensive) 1461 } 1462 1463 func (p *protocolSelectionConstraints) selectProtocol( 1464 connectTunnelCount int, 1465 excludeIntensive bool, 1466 serverEntry *protocol.ServerEntry) (string, bool) { 1467 1468 candidateProtocols := p.supportedProtocols(connectTunnelCount, excludeIntensive, serverEntry) 1469 1470 if len(candidateProtocols) == 0 { 1471 return "", false 1472 } 1473 1474 // Pick at random from the supported protocols. This ensures that we'll 1475 // eventually try all possible protocols. Depending on network 1476 // configuration, it may be the case that some protocol is only available 1477 // through multi-capability servers, and a simpler ranked preference of 1478 // protocols could lead to that protocol never being selected. 1479 1480 index := prng.Intn(len(candidateProtocols)) 1481 1482 return candidateProtocols[index], true 1483 1484 } 1485 1486 type candidateServerEntry struct { 1487 serverEntry *protocol.ServerEntry 1488 isServerAffinityCandidate bool 1489 adjustedEstablishStartTime time.Time 1490 } 1491 1492 // startEstablishing creates a pool of worker goroutines which will 1493 // attempt to establish tunnels to candidate servers. The candidates 1494 // are generated by another goroutine. 1495 func (controller *Controller) startEstablishing() { 1496 if controller.isEstablishing { 1497 return 1498 } 1499 NoticeInfo("start establishing") 1500 1501 // establishStartTime is used to calculate and report the client's tunnel 1502 // establishment duration. Establishment duration should include all 1503 // initialization in launchEstablishing and establishCandidateGenerator, 1504 // including any potentially long-running datastore iterations. 1505 establishStartTime := time.Now() 1506 1507 controller.concurrentEstablishTunnelsMutex.Lock() 1508 controller.establishConnectTunnelCount = 0 1509 controller.concurrentEstablishTunnels = 0 1510 controller.concurrentIntensiveEstablishTunnels = 0 1511 controller.peakConcurrentEstablishTunnels = 0 1512 controller.peakConcurrentIntensiveEstablishTunnels = 0 1513 controller.concurrentEstablishTunnelsMutex.Unlock() 1514 1515 DoGarbageCollection() 1516 emitMemoryMetrics() 1517 1518 // The establish context cancelFunc, controller.stopEstablish, is called in 1519 // controller.stopEstablishing. 1520 1521 controller.isEstablishing = true 1522 controller.establishStartTime = establishStartTime 1523 controller.establishCtx, controller.stopEstablish = context.WithCancel(controller.runCtx) 1524 controller.establishWaitGroup = new(sync.WaitGroup) 1525 controller.candidateServerEntries = make(chan *candidateServerEntry) 1526 1527 // The server affinity mechanism attempts to favor the previously 1528 // used server when reconnecting. This is beneficial for user 1529 // applications which expect consistency in user IP address (for 1530 // example, a web site which prompts for additional user 1531 // authentication when the IP address changes). 1532 // 1533 // Only the very first server, as determined by 1534 // datastore.PromoteServerEntry(), is the server affinity candidate. 1535 // Concurrent connections attempts to many servers are launched 1536 // without delay, in case the affinity server connection fails. 1537 // While the affinity server connection is outstanding, when any 1538 // other connection is established, there is a short grace period 1539 // delay before delivering the established tunnel; this allows some 1540 // time for the affinity server connection to succeed first. 1541 // When the affinity server connection fails, any other established 1542 // tunnel is registered without delay. 1543 // 1544 // Note: the establishTunnelWorker that receives the affinity 1545 // candidate is solely resonsible for closing 1546 // controller.serverAffinityDoneBroadcast. 1547 controller.serverAffinityDoneBroadcast = make(chan struct{}) 1548 1549 controller.establishWaitGroup.Add(1) 1550 go controller.launchEstablishing() 1551 } 1552 1553 func (controller *Controller) launchEstablishing() { 1554 1555 defer controller.establishWaitGroup.Done() 1556 1557 // Before starting the establish tunnel workers, get and apply 1558 // tactics, launching a tactics request if required. 1559 // 1560 // Wait only TacticsWaitPeriod for the tactics request to complete (or 1561 // fail) before proceeding with tunnel establishment, in case the tactics 1562 // request is blocked or takes very long to complete. 1563 // 1564 // An in-flight tactics request uses meek in round tripper mode, which 1565 // uses less resources than meek tunnel relay mode. For this reason, the 1566 // tactics request is not counted in concurrentIntensiveEstablishTunnels. 1567 // 1568 // TODO: HTTP/2 uses significantly more memory, so perhaps 1569 // concurrentIntensiveEstablishTunnels should be counted in that case. 1570 // 1571 // Any in-flight tactics request or pending retry will be 1572 // canceled when establishment is stopped. 1573 1574 if !controller.config.DisableTactics { 1575 1576 timeout := controller.config.GetParameters().Get().Duration( 1577 parameters.TacticsWaitPeriod) 1578 1579 tacticsDone := make(chan struct{}) 1580 tacticsWaitPeriod := time.NewTimer(timeout) 1581 defer tacticsWaitPeriod.Stop() 1582 1583 controller.establishWaitGroup.Add(1) 1584 go func() { 1585 defer controller.establishWaitGroup.Done() 1586 defer close(tacticsDone) 1587 GetTactics(controller.establishCtx, controller.config) 1588 }() 1589 1590 select { 1591 case <-tacticsDone: 1592 case <-tacticsWaitPeriod.C: 1593 } 1594 1595 tacticsWaitPeriod.Stop() 1596 1597 if controller.isStopEstablishing() { 1598 // This check isn't strictly required but avoids the overhead of launching 1599 // workers if establishment stopped while awaiting a tactics request. 1600 return 1601 } 1602 } 1603 1604 // Initial- and LimitTunnelProtocols may be set by tactics. 1605 // 1606 // These protocol limits are fixed once per establishment, for 1607 // consistent application of related probabilities (applied by 1608 // ParametersAccessor.TunnelProtocols). The 1609 // establishLimitTunnelProtocolsState field must be read-only after this 1610 // point, allowing concurrent reads by establishment workers. 1611 1612 p := controller.config.GetParameters().Get() 1613 1614 controller.protocolSelectionConstraints = &protocolSelectionConstraints{ 1615 useUpstreamProxy: controller.config.UseUpstreamProxy(), 1616 initialLimitTunnelProtocols: p.TunnelProtocols(parameters.InitialLimitTunnelProtocols), 1617 initialLimitTunnelProtocolsCandidateCount: p.Int(parameters.InitialLimitTunnelProtocolsCandidateCount), 1618 limitTunnelProtocols: p.TunnelProtocols(parameters.LimitTunnelProtocols), 1619 1620 limitTunnelDialPortNumbers: protocol.TunnelProtocolPortLists( 1621 p.TunnelProtocolPortLists(parameters.LimitTunnelDialPortNumbers)), 1622 1623 replayCandidateCount: p.Int(parameters.ReplayCandidateCount), 1624 } 1625 1626 // ConnectionWorkerPoolSize may be set by tactics. 1627 1628 workerPoolSize := p.Int(parameters.ConnectionWorkerPoolSize) 1629 1630 // When TargetServerEntry is used, override any worker pool size config or 1631 // tactic parameter and use a pool size of 1. The typical use case for 1632 // TargetServerEntry is to test a specific server with a single connection 1633 // attempt. Furthermore, too many concurrent attempts to connect to the 1634 // same server will trigger rate limiting. 1635 if controller.config.TargetServerEntry != "" { 1636 workerPoolSize = 1 1637 } 1638 1639 // TunnelPoolSize may be set by tactics, subject to local constraints. A pool 1640 // size of one is forced in packet tunnel mode or when using a 1641 // TargetServerEntry. The tunnel pool size is reduced when there are 1642 // insufficent known server entries, within the set region and protocol 1643 // constraints, to satisfy the target. 1644 // 1645 // Limitations, to simplify concurrent access to shared state: a ceiling of 1646 // MAX_TUNNEL_POOL_SIZE is enforced by setTunnelPoolSize; the tunnel pool 1647 // size target is not re-adjusted after an API handshake, even though the 1648 // handshake response may deliver new tactics, or prune server entries which 1649 // were potential candidates; nor is the target re-adjusted after fetching 1650 // new server entries during this establishment. 1651 1652 tunnelPoolSize := p.Int(parameters.TunnelPoolSize) 1653 if controller.config.PacketTunnelTunFileDescriptor > 0 || 1654 controller.config.TargetServerEntry != "" { 1655 tunnelPoolSize = 1 1656 } 1657 1658 p.Close() 1659 1660 // Trigger CandidateServers and AvailableEgressRegions notices. By default, 1661 // this is an asynchronous operation, as the underlying full server entry 1662 // list enumeration may be a slow operation. In certain cases, where 1663 // candidate counts are required up front, await the result before 1664 // proceeding. 1665 1666 awaitResponse := tunnelPoolSize > 1 || 1667 controller.protocolSelectionConstraints.initialLimitTunnelProtocolsCandidateCount > 0 1668 1669 // AvailableEgressRegions: after a fresh install, the outer client may not 1670 // have a list of regions to display; and LimitTunnelProtocols may reduce the 1671 // number of available regions. 1672 // 1673 // When the outer client receives NoticeAvailableEgressRegions and the 1674 // configured EgressRegion is not included in the region list, the outer 1675 // client _should_ stop tunnel-core and prompt the user to change the region 1676 // selection, as there are insufficient servers/capabilities to establish a 1677 // tunnel in the selected region. 1678 // 1679 // This report is delayed until after tactics are likely to be applied, 1680 // above; this avoids a ReportAvailableRegions reporting too many regions, 1681 // followed shortly by a ReportAvailableRegions reporting fewer regions. That 1682 // sequence could cause issues in the outer client UI. 1683 // 1684 // The reported regions are limited by protocolSelectionConstraints; in the 1685 // case where an initial limit is in place, only regions available for the 1686 // initial limit are reported. The initial phase will not complete if 1687 // EgressRegion is set such that there are no server entries with the 1688 // necessary protocol capabilities (either locally or from a remote server 1689 // list fetch). 1690 1691 // Concurrency note: controller.protocolSelectionConstraints may be 1692 // overwritten before serverEntriesReporter reads it, and so cannot be 1693 // accessed directly by serverEntriesReporter. 1694 reportRequest := &serverEntriesReportRequest{ 1695 constraints: controller.protocolSelectionConstraints, 1696 } 1697 1698 if awaitResponse { 1699 // Buffer size of 1 ensures the sender, serverEntryReporter, won't block on 1700 // sending the response in the case where launchEstablishing exits due to 1701 // stopping establishment. 1702 reportRequest.awaitResponse = make(chan *serverEntriesReportResponse, 1) 1703 } 1704 1705 controller.signalServerEntriesReporter(reportRequest) 1706 1707 if awaitResponse { 1708 1709 var reportResponse *serverEntriesReportResponse 1710 select { 1711 case reportResponse = <-reportRequest.awaitResponse: 1712 case <-controller.establishCtx.Done(): 1713 // The sender may be gone when shutting down, or may not send until after 1714 // stopping establishment. 1715 return 1716 } 1717 if reportResponse.err != nil { 1718 NoticeError("failed to report server entries: %v", 1719 errors.Trace(reportResponse.err)) 1720 controller.SignalComponentFailure() 1721 return 1722 } 1723 1724 // Make adjustments based on candidate counts. 1725 1726 if tunnelPoolSize > 1 { 1727 // Initial canidate count is ignored as count candidates will eventually 1728 // become available. 1729 if reportResponse.candidates < tunnelPoolSize { 1730 tunnelPoolSize = reportResponse.candidates 1731 } 1732 if tunnelPoolSize < 1 { 1733 tunnelPoolSize = 1 1734 } 1735 } 1736 controller.setTunnelPoolSize(tunnelPoolSize) 1737 1738 // If InitialLimitTunnelProtocols is configured but cannot be satisfied, 1739 // skip the initial phase in this establishment. This avoids spinning, 1740 // unable to connect, in this case. InitialLimitTunnelProtocols is 1741 // intended to prioritize certain protocols, but not strictly select them. 1742 // 1743 // The candidate count check ignores egress region selection. When an egress 1744 // region is selected, it's the responsibility of the outer client to react 1745 // to the following ReportAvailableRegions output and clear the user's 1746 // selected region to prevent spinning, unable to connect. The initial phase 1747 // is skipped only when InitialLimitTunnelProtocols cannot be satisfied 1748 // _regardless_ of region selection. 1749 // 1750 // We presume that, in practise, most clients will have embedded server 1751 // entries with capabilities for most protocols; and that clients will 1752 // often perform RSL checks. So clients should most often have the 1753 // necessary capabilities to satisfy InitialLimitTunnelProtocols. When 1754 // this check fails, RSL/OSL/upgrade checks are triggered in order to gain 1755 // new capabilities. 1756 // 1757 // LimitTunnelProtocols remains a hard limit, as using prohibited 1758 // protocols may have some bad effect, such as a firewall blocking all 1759 // traffic from a host. 1760 1761 if controller.protocolSelectionConstraints.initialLimitTunnelProtocolsCandidateCount > 0 { 1762 1763 if reportResponse.initialCandidatesAnyEgressRegion == 0 { 1764 NoticeWarning("skipping initial limit tunnel protocols") 1765 controller.protocolSelectionConstraints.initialLimitTunnelProtocolsCandidateCount = 0 1766 1767 // Since we were unable to satisfy the InitialLimitTunnelProtocols 1768 // tactic, trigger RSL, OSL, and upgrade fetches to potentially 1769 // gain new capabilities. 1770 controller.triggerFetches() 1771 } 1772 } 1773 } 1774 1775 for i := 0; i < workerPoolSize; i++ { 1776 controller.establishWaitGroup.Add(1) 1777 go controller.establishTunnelWorker() 1778 } 1779 1780 controller.establishWaitGroup.Add(1) 1781 go controller.establishCandidateGenerator() 1782 } 1783 1784 // stopEstablishing signals the establish goroutines to stop and waits 1785 // for the group to halt. 1786 func (controller *Controller) stopEstablishing() { 1787 if !controller.isEstablishing { 1788 return 1789 } 1790 NoticeInfo("stop establishing") 1791 controller.stopEstablish() 1792 // Note: establishCandidateGenerator closes controller.candidateServerEntries 1793 // (as it may be sending to that channel). 1794 controller.establishWaitGroup.Wait() 1795 NoticeInfo("stopped establishing") 1796 1797 controller.isEstablishing = false 1798 controller.establishStartTime = time.Time{} 1799 controller.establishCtx = nil 1800 controller.stopEstablish = nil 1801 controller.establishWaitGroup = nil 1802 controller.candidateServerEntries = nil 1803 controller.serverAffinityDoneBroadcast = nil 1804 1805 controller.concurrentEstablishTunnelsMutex.Lock() 1806 peakConcurrent := controller.peakConcurrentEstablishTunnels 1807 peakConcurrentIntensive := controller.peakConcurrentIntensiveEstablishTunnels 1808 controller.establishConnectTunnelCount = 0 1809 controller.concurrentEstablishTunnels = 0 1810 controller.concurrentIntensiveEstablishTunnels = 0 1811 controller.peakConcurrentEstablishTunnels = 0 1812 controller.peakConcurrentIntensiveEstablishTunnels = 0 1813 controller.concurrentEstablishTunnelsMutex.Unlock() 1814 NoticeInfo("peak concurrent establish tunnels: %d", peakConcurrent) 1815 NoticeInfo("peak concurrent resource intensive establish tunnels: %d", peakConcurrentIntensive) 1816 1817 emitMemoryMetrics() 1818 DoGarbageCollection() 1819 1820 // Record datastore metrics after establishment, the phase which generates 1821 // the bulk of all datastore transactions: iterating over server entries, 1822 // storing new server entries, etc. 1823 emitDatastoreMetrics() 1824 1825 // Similarly, establishment generates the bulk of domain resolves. 1826 emitDNSMetrics(controller.resolver) 1827 } 1828 1829 // establishCandidateGenerator populates the candidate queue with server entries 1830 // from the data store. Server entries are iterated in rank order, so that promoted 1831 // servers with higher rank are priority candidates. 1832 func (controller *Controller) establishCandidateGenerator() { 1833 defer controller.establishWaitGroup.Done() 1834 defer close(controller.candidateServerEntries) 1835 1836 // networkWaitDuration is the elapsed time spent waiting 1837 // for network connectivity. This duration will be excluded 1838 // from reported tunnel establishment duration. 1839 var totalNetworkWaitDuration time.Duration 1840 1841 applyServerAffinity, iterator, err := NewServerEntryIterator(controller.config) 1842 if err != nil { 1843 NoticeError("failed to iterate over candidates: %v", errors.Trace(err)) 1844 controller.SignalComponentFailure() 1845 return 1846 } 1847 defer iterator.Close() 1848 1849 // TODO: reconcile server affinity scheme with multi-tunnel mode 1850 if controller.getTunnelPoolSize() > 1 { 1851 applyServerAffinity = false 1852 } 1853 1854 isServerAffinityCandidate := true 1855 if !applyServerAffinity { 1856 isServerAffinityCandidate = false 1857 close(controller.serverAffinityDoneBroadcast) 1858 } 1859 1860 loop: 1861 // Repeat until stopped 1862 for { 1863 1864 // A "round" consists of a new shuffle of the server entries and attempted 1865 // connections up to the end of the server entry iterator, or 1866 // parameters.EstablishTunnelWorkTime elapsed. Time spent waiting for 1867 // network connectivity is excluded from round elapsed time. 1868 // 1869 // After a round, if parameters.EstablishTunnelWorkTime has elapsed in total 1870 // with no tunnel established, remote server list and upgrade checks are 1871 // triggered. 1872 // 1873 // A complete server entry iteration does not trigger fetches since it's 1874 // possible to have fewer than parameters.ConnectionWorkerPoolSize 1875 // candidates, in which case rounds end instantly due to the complete server 1876 // entry iteration. An exception is made for an empty server entry iterator; 1877 // in that case fetches may be triggered immediately. 1878 // 1879 // The number of server candidates may change during this loop, due to 1880 // remote server list fetches. Due to the performance impact, we will not 1881 // trigger additional, informational CandidateServer notices while in the 1882 // establishing loop. Clients typically re-establish often enough that we 1883 // will see the effect of the remote server list fetch in diagnostics. 1884 1885 roundStartTime := time.Now() 1886 var roundNetworkWaitDuration time.Duration 1887 1888 workTime := controller.config.GetParameters().Get().Duration( 1889 parameters.EstablishTunnelWorkTime) 1890 1891 candidateServerEntryCount := 0 1892 1893 // Send each iterator server entry to the establish workers 1894 for { 1895 1896 networkWaitStartTime := time.Now() 1897 if !WaitForNetworkConnectivity( 1898 controller.establishCtx, 1899 controller.config.NetworkConnectivityChecker) { 1900 break loop 1901 } 1902 networkWaitDuration := time.Since(networkWaitStartTime) 1903 roundNetworkWaitDuration += networkWaitDuration 1904 totalNetworkWaitDuration += networkWaitDuration 1905 1906 serverEntry, err := iterator.Next() 1907 if err != nil { 1908 NoticeError("failed to get next candidate: %v", errors.Trace(err)) 1909 controller.SignalComponentFailure() 1910 break loop 1911 } 1912 if serverEntry == nil { 1913 // Completed this iteration 1914 NoticeInfo("completed server entry iteration") 1915 break 1916 } 1917 1918 if controller.config.TargetApiProtocol == protocol.PSIPHON_SSH_API_PROTOCOL && 1919 !serverEntry.SupportsSSHAPIRequests() { 1920 continue 1921 } 1922 1923 candidateServerEntryCount += 1 1924 1925 // adjustedEstablishStartTime is establishStartTime shifted 1926 // to exclude time spent waiting for network connectivity. 1927 adjustedEstablishStartTime := controller.establishStartTime.Add( 1928 totalNetworkWaitDuration) 1929 1930 candidate := &candidateServerEntry{ 1931 serverEntry: serverEntry, 1932 isServerAffinityCandidate: isServerAffinityCandidate, 1933 adjustedEstablishStartTime: adjustedEstablishStartTime, 1934 } 1935 1936 wasServerAffinityCandidate := isServerAffinityCandidate 1937 1938 // Note: there must be only one server affinity candidate, as it 1939 // closes the serverAffinityDoneBroadcast channel. 1940 isServerAffinityCandidate = false 1941 1942 // TODO: here we could generate multiple candidates from the 1943 // server entry when there are many MeekFrontingAddresses. 1944 1945 select { 1946 case controller.candidateServerEntries <- candidate: 1947 case <-controller.establishCtx.Done(): 1948 break loop 1949 } 1950 1951 if time.Since(roundStartTime)-roundNetworkWaitDuration > workTime { 1952 // Start over, after a brief pause, with a new shuffle of the server 1953 // entries, and potentially some newly fetched server entries. 1954 break 1955 } 1956 1957 if wasServerAffinityCandidate { 1958 1959 // Don't start the next candidate until either the server affinity 1960 // candidate has completed (success or failure) or is still working 1961 // and the grace period has elapsed. 1962 1963 gracePeriod := controller.config.GetParameters().Get().Duration( 1964 parameters.EstablishTunnelServerAffinityGracePeriod) 1965 1966 if gracePeriod > 0 { 1967 timer := time.NewTimer(gracePeriod) 1968 select { 1969 case <-timer.C: 1970 case <-controller.serverAffinityDoneBroadcast: 1971 case <-controller.establishCtx.Done(): 1972 timer.Stop() 1973 break loop 1974 } 1975 timer.Stop() 1976 } 1977 } 1978 } 1979 1980 // Free up resources now, but don't reset until after the pause. 1981 iterator.Close() 1982 1983 // Trigger RSL, OSL, and upgrade checks after failing to establish a 1984 // tunnel within parameters.EstablishTunnelWorkTime, or if there are 1985 // no server entries present. 1986 // 1987 // While the trigger is made after each round, 1988 // parameter.FetchRemoteServerListStalePeriod will limit the actual 1989 // frequency of fetches. Continuing to trigger allows for very long running 1990 // establishments to perhaps eventually succeed. 1991 // 1992 // No fetches are triggered when TargetServerEntry is specified. In that 1993 // case, we're only trying to connect to a specific server entry. 1994 1995 if (candidateServerEntryCount == 0 || 1996 time.Since(controller.establishStartTime)-totalNetworkWaitDuration > workTime) && 1997 controller.config.TargetServerEntry == "" { 1998 1999 controller.triggerFetches() 2000 } 2001 2002 // After a complete iteration of candidate servers, pause before iterating again. 2003 // This helps avoid some busy wait loop conditions, and also allows some time for 2004 // network conditions to change. Also allows for fetch remote to complete, 2005 // in typical conditions (it isn't strictly necessary to wait for this, there will 2006 // be more rounds if required). 2007 2008 p := controller.config.GetParameters().Get() 2009 timeout := prng.JitterDuration( 2010 p.Duration(parameters.EstablishTunnelPausePeriod), 2011 p.Float(parameters.EstablishTunnelPausePeriodJitter)) 2012 p.Close() 2013 2014 timer := time.NewTimer(timeout) 2015 select { 2016 case <-timer.C: 2017 // Retry iterating 2018 case <-controller.establishCtx.Done(): 2019 timer.Stop() 2020 break loop 2021 } 2022 timer.Stop() 2023 2024 iterator.Reset() 2025 } 2026 } 2027 2028 // establishTunnelWorker pulls candidates from the candidate queue, establishes 2029 // a connection to the tunnel server, and delivers the connected tunnel to a channel. 2030 func (controller *Controller) establishTunnelWorker() { 2031 defer controller.establishWaitGroup.Done() 2032 loop: 2033 for candidateServerEntry := range controller.candidateServerEntries { 2034 2035 // Note: don't receive from candidateServerEntries and isStopEstablishing 2036 // in the same select, since we want to prioritize receiving the stop signal 2037 if controller.isStopEstablishing() { 2038 break loop 2039 } 2040 2041 // There may already be a tunnel to this candidate. If so, skip it. 2042 if controller.isActiveTunnelServerEntry(candidateServerEntry.serverEntry) { 2043 continue 2044 } 2045 2046 // TODO: we allow multiple, concurrent workers to attempt to connect to the 2047 // same server. This is not wasteful if the server supports several 2048 // different protocols, some of which may be blocked while others are not 2049 // blocked. Limiting protocols with [Initial]LimitTunnelProtocols may make 2050 // these multiple attempts redundent. Also, replay should be used only by 2051 // the first attempt. 2052 2053 // upstreamProxyErrorCallback will post NoticeUpstreamProxyError when the 2054 // tunnel dial fails due to an upstream proxy error. As the upstream proxy 2055 // is user configured, the error message may need to be relayed to the user. 2056 2057 // As the callback may be invoked after establishment is over (e.g., if an 2058 // initial dial isn't fully shutdown when ConnectTunnel returns; or a meek 2059 // underlying TCP connection re-dial) don't access these variables 2060 // directly. 2061 callbackCandidateServerEntry := candidateServerEntry 2062 callbackEstablishCtx := controller.establishCtx 2063 2064 upstreamProxyErrorCallback := func(err error) { 2065 2066 // Do not post the notice when overall establishment context is canceled or 2067 // timed-out: the upstream proxy connection error is likely a result of the 2068 // cancellation, and not a condition to be fixed by the user. In the case 2069 // of meek underlying TCP connection re-dials, this condition will always 2070 // be true; however in this case the initial dial succeeded with the 2071 // current upstream proxy settings, so any upstream proxy error is 2072 // transient. 2073 if callbackEstablishCtx.Err() != nil { 2074 return 2075 } 2076 2077 // Another class of non-fatal upstream proxy error arises from proxies 2078 // which limit permitted proxied ports. In this case, some tunnels may fail 2079 // due to dial port, while others may eventually succeed. To avoid this 2080 // class of errors, delay posting the notice. If the upstream proxy works, 2081 // _some_ tunnel should connect. If the upstream proxy configuration is 2082 // broken, the error should persist and eventually get posted. 2083 2084 p := controller.config.GetParameters().Get() 2085 workerPoolSize := p.Int(parameters.ConnectionWorkerPoolSize) 2086 minWaitDuration := p.Duration(parameters.UpstreamProxyErrorMinWaitDuration) 2087 maxWaitDuration := p.Duration(parameters.UpstreamProxyErrorMaxWaitDuration) 2088 p.Close() 2089 2090 controller.concurrentEstablishTunnelsMutex.Lock() 2091 establishConnectTunnelCount := controller.establishConnectTunnelCount 2092 controller.concurrentEstablishTunnelsMutex.Unlock() 2093 2094 // Delay UpstreamProxyErrorMinWaitDuration (excluding time spent waiting 2095 // for network connectivity) and then until either 2096 // UpstreamProxyErrorMaxWaitDuration has elapsed or, to post sooner if many 2097 // candidates are failing, at least workerPoolSize tunnel connection 2098 // attempts have completed. We infer that at least workerPoolSize 2099 // candidates have completed by checking that at least 2*workerPoolSize 2100 // candidates have started. 2101 2102 elapsedTime := time.Since( 2103 callbackCandidateServerEntry.adjustedEstablishStartTime) 2104 2105 if elapsedTime < minWaitDuration || 2106 (elapsedTime < maxWaitDuration && 2107 establishConnectTunnelCount < 2*workerPoolSize) { 2108 return 2109 } 2110 2111 NoticeUpstreamProxyError(err) 2112 } 2113 2114 // Select the tunnel protocol. The selection will be made at random 2115 // from protocols supported by the server entry, optionally limited by 2116 // LimitTunnelProtocols. 2117 // 2118 // When limiting concurrent resource intensive protocol connection 2119 // workers, and at the limit, do not select resource intensive 2120 // protocols since otherwise the candidate must be skipped. 2121 // 2122 // If at the limit and unabled to select a non-intensive protocol, 2123 // skip the candidate entirely and move on to the next. Since 2124 // candidates are shuffled it's likely that the next candidate is not 2125 // intensive. In this case, a StaggerConnectionWorkersMilliseconds 2126 // delay may still be incurred. 2127 2128 limitIntensiveConnectionWorkers := controller.config.GetParameters().Get().Int( 2129 parameters.LimitIntensiveConnectionWorkers) 2130 2131 controller.concurrentEstablishTunnelsMutex.Lock() 2132 2133 excludeIntensive := false 2134 if limitIntensiveConnectionWorkers > 0 && 2135 controller.concurrentIntensiveEstablishTunnels >= limitIntensiveConnectionWorkers { 2136 excludeIntensive = true 2137 } 2138 2139 canReplay := func(serverEntry *protocol.ServerEntry, replayProtocol string) bool { 2140 return controller.protocolSelectionConstraints.canReplay( 2141 controller.establishConnectTunnelCount, 2142 excludeIntensive, 2143 serverEntry, 2144 replayProtocol) 2145 } 2146 2147 selectProtocol := func(serverEntry *protocol.ServerEntry) (string, bool) { 2148 return controller.protocolSelectionConstraints.selectProtocol( 2149 controller.establishConnectTunnelCount, 2150 excludeIntensive, 2151 serverEntry) 2152 } 2153 2154 // MakeDialParameters may return a replay instance, if the server 2155 // entry has a previous, recent successful connection and 2156 // tactics/config has not changed. 2157 // 2158 // In the first round -- and later rounds, with some probability -- of 2159 // establishing, ServerEntryIterator will move potential replay candidates 2160 // to the front of the iterator after the random shuffle, which greatly 2161 // prioritizes previously successful servers for that round. 2162 // 2163 // As ServerEntryIterator does not unmarshal and validate replay 2164 // candidate dial parameters, some potential replay candidates may 2165 // have expired or otherwise ineligible dial parameters; in this case 2166 // the candidate proceeds without replay. 2167 // 2168 // The ReplayCandidateCount tactic determines how many candidates may use 2169 // replay. After ReplayCandidateCount candidates of any type, replay or no, 2170 // replay is skipped. If ReplayCandidateCount exceeds the intial round, 2171 // replay may still be performed but the iterator may no longer move 2172 // potential replay server entries to the front. When ReplayCandidateCount 2173 // is set to -1, unlimited candidates may use replay. 2174 2175 dialParams, err := MakeDialParameters( 2176 controller.config, 2177 upstreamProxyErrorCallback, 2178 canReplay, 2179 selectProtocol, 2180 candidateServerEntry.serverEntry, 2181 false, 2182 controller.establishConnectTunnelCount, 2183 int(atomic.LoadInt32(&controller.establishedTunnelsCount))) 2184 if dialParams == nil || err != nil { 2185 2186 controller.concurrentEstablishTunnelsMutex.Unlock() 2187 2188 // MakeDialParameters returns nil/nil when the server entry is to 2189 // be skipped. See MakeDialParameters for skip cases and skip 2190 // logging. Silently fail the candidate in this case. Otherwise, 2191 // emit error. 2192 if err != nil { 2193 NoticeInfo("failed to make dial parameters for %s: %v", 2194 candidateServerEntry.serverEntry.GetDiagnosticID(), 2195 errors.Trace(err)) 2196 } 2197 2198 // Unblock other candidates immediately when server affinity 2199 // candidate is skipped. 2200 if candidateServerEntry.isServerAffinityCandidate { 2201 close(controller.serverAffinityDoneBroadcast) 2202 } 2203 2204 continue 2205 } 2206 2207 // Increment establishConnectTunnelCount only after selectProtocol has 2208 // succeeded to ensure InitialLimitTunnelProtocolsCandidateCount 2209 // candidates use InitialLimitTunnelProtocols. 2210 establishConnectTunnelCount := controller.establishConnectTunnelCount 2211 controller.establishConnectTunnelCount += 1 2212 2213 isIntensive := protocol.TunnelProtocolIsResourceIntensive(dialParams.TunnelProtocol) 2214 2215 if isIntensive { 2216 controller.concurrentIntensiveEstablishTunnels += 1 2217 if controller.concurrentIntensiveEstablishTunnels > controller.peakConcurrentIntensiveEstablishTunnels { 2218 controller.peakConcurrentIntensiveEstablishTunnels = controller.concurrentIntensiveEstablishTunnels 2219 } 2220 } 2221 controller.concurrentEstablishTunnels += 1 2222 if controller.concurrentEstablishTunnels > controller.peakConcurrentEstablishTunnels { 2223 controller.peakConcurrentEstablishTunnels = controller.concurrentEstablishTunnels 2224 } 2225 2226 controller.concurrentEstablishTunnelsMutex.Unlock() 2227 2228 // Apply stagger only now that we're past MakeDialParameters and 2229 // protocol selection logic which may have caused the candidate to be 2230 // skipped. The stagger logic delays dialing, and we don't want to 2231 // incur that delay that when skipping. 2232 // 2233 // Locking staggerMutex serializes staggers, so that multiple workers 2234 // don't simply sleep in parallel. 2235 // 2236 // The stagger is applied when establishConnectTunnelCount > 0 -- that 2237 // is, for all but the first dial. 2238 2239 p := controller.config.GetParameters().Get() 2240 staggerPeriod := p.Duration(parameters.StaggerConnectionWorkersPeriod) 2241 staggerJitter := p.Float(parameters.StaggerConnectionWorkersJitter) 2242 p.Close() 2243 2244 if establishConnectTunnelCount > 0 && staggerPeriod != 0 { 2245 controller.staggerMutex.Lock() 2246 timer := time.NewTimer(prng.JitterDuration(staggerPeriod, staggerJitter)) 2247 select { 2248 case <-timer.C: 2249 case <-controller.establishCtx.Done(): 2250 } 2251 timer.Stop() 2252 controller.staggerMutex.Unlock() 2253 } 2254 2255 // ConnectTunnel will allocate significant memory, so first attempt to 2256 // reclaim as much as possible. 2257 DoGarbageCollection() 2258 2259 tunnel, err := ConnectTunnel( 2260 controller.establishCtx, 2261 controller.config, 2262 candidateServerEntry.adjustedEstablishStartTime, 2263 dialParams) 2264 2265 controller.concurrentEstablishTunnelsMutex.Lock() 2266 if isIntensive { 2267 controller.concurrentIntensiveEstablishTunnels -= 1 2268 } 2269 controller.concurrentEstablishTunnels -= 1 2270 controller.concurrentEstablishTunnelsMutex.Unlock() 2271 2272 // Periodically emit memory metrics during the establishment cycle. 2273 if !controller.isStopEstablishing() { 2274 emitMemoryMetrics() 2275 } 2276 2277 // Immediately reclaim memory allocated by the establishment. In the case 2278 // of failure, first clear the reference to the tunnel. In the case of 2279 // success, the garbage collection may still be effective as the initial 2280 // phases of some protocols involve significant memory allocation that 2281 // could now be reclaimed. 2282 if err != nil { 2283 tunnel = nil 2284 } 2285 DoGarbageCollection() 2286 2287 if err != nil { 2288 2289 // Unblock other candidates immediately when server affinity 2290 // candidate fails. 2291 if candidateServerEntry.isServerAffinityCandidate { 2292 close(controller.serverAffinityDoneBroadcast) 2293 } 2294 2295 // Before emitting error, check if establish interrupted, in which 2296 // case the error is noise. 2297 if controller.isStopEstablishing() { 2298 break loop 2299 } 2300 2301 NoticeInfo("failed to connect to %s: %v", 2302 candidateServerEntry.serverEntry.GetDiagnosticID(), 2303 errors.Trace(err)) 2304 2305 continue 2306 } 2307 2308 // Deliver connected tunnel. 2309 // Don't block. Assumes the receiver has a buffer large enough for 2310 // the number of desired tunnels. If there's no room, the tunnel must 2311 // not be required so it's discarded. 2312 select { 2313 case controller.connectedTunnels <- tunnel: 2314 default: 2315 controller.discardTunnel(tunnel) 2316 2317 // Clear the reference to this discarded tunnel and immediately run 2318 // a garbage collection to reclaim its memory. 2319 tunnel = nil 2320 DoGarbageCollection() 2321 } 2322 2323 // Unblock other candidates only after delivering when 2324 // server affinity candidate succeeds. 2325 if candidateServerEntry.isServerAffinityCandidate { 2326 close(controller.serverAffinityDoneBroadcast) 2327 } 2328 } 2329 } 2330 2331 func (controller *Controller) isStopEstablishing() bool { 2332 select { 2333 case <-controller.establishCtx.Done(): 2334 return true 2335 default: 2336 } 2337 return false 2338 }