github.com/astaguna/popon-core@v0.0.0-20231019235610-96e42d76a5ff/psiphon/controller.go (about) 1 /* 2 * Copyright (c) 2015, Psiphon Inc. 3 * All rights reserved. 4 * 5 * This program is free software: you can redistribute it and/or modify 6 * it under the terms of the GNU General Public License as published by 7 * the Free Software Foundation, either version 3 of the License, or 8 * (at your option) any later version. 9 * 10 * This program is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * GNU General Public License for more details. 14 * 15 * You should have received a copy of the GNU General Public License 16 * along with this program. If not, see <http://www.gnu.org/licenses/>. 17 * 18 */ 19 20 // Package psiphon implements the core tunnel functionality of a Psiphon client. 21 // The main function is RunForever, which runs a Controller that obtains lists of 22 // servers, establishes tunnel connections, and runs local proxies through which 23 // tunneled traffic may be sent. 24 package psiphon 25 26 import ( 27 "context" 28 "fmt" 29 "math/rand" 30 "net" 31 "runtime" 32 "sync" 33 "sync/atomic" 34 "time" 35 36 "github.com/astaguna/popon-core/psiphon/common" 37 "github.com/astaguna/popon-core/psiphon/common/errors" 38 "github.com/astaguna/popon-core/psiphon/common/parameters" 39 "github.com/astaguna/popon-core/psiphon/common/prng" 40 "github.com/astaguna/popon-core/psiphon/common/protocol" 41 "github.com/astaguna/popon-core/psiphon/common/resolver" 42 "github.com/astaguna/popon-core/psiphon/common/tun" 43 lrucache "github.com/cognusion/go-cache-lru" 44 ) 45 46 // Controller is a tunnel lifecycle coordinator. It manages lists of servers to 47 // connect to; establishes and monitors tunnels; and runs local proxies which 48 // route traffic through the tunnels. 
type Controller struct {
	config *Config

	// Run lifecycle: runCtx/stopRunning bound all worker goroutines;
	// runWaitGroup tracks them for shutdown.
	runCtx       context.Context
	stopRunning  context.CancelFunc
	runWaitGroup *sync.WaitGroup

	// Tunnel pool state. connectedTunnels/failedTunnels are buffered to
	// MAX_TUNNEL_POOL_SIZE so senders do not block (see NewController).
	connectedTunnels chan *Tunnel
	failedTunnels    chan *Tunnel
	tunnelMutex      sync.Mutex
	establishedOnce  bool
	tunnelPoolSize   int
	tunnels          []*Tunnel
	nextTunnel       int

	// Establishment state and counters.
	isEstablishing                          bool
	establishStartTime                      time.Time
	protocolSelectionConstraints            *protocolSelectionConstraints
	concurrentEstablishTunnelsMutex         sync.Mutex
	establishConnectTunnelCount             int
	concurrentEstablishTunnels              int
	concurrentIntensiveEstablishTunnels     int
	peakConcurrentEstablishTunnels          int
	peakConcurrentIntensiveEstablishTunnels int
	establishCtx                            context.Context
	stopEstablish                           context.CancelFunc
	establishWaitGroup                      *sync.WaitGroup
	establishedTunnelsCount                 int32
	candidateServerEntries                  chan *candidateServerEntry

	// Untunneled dial support (remote server list fetches, upgrade
	// downloads) and split tunnel classification cache.
	untunneledDialConfig                 *DialConfig
	untunneledSplitTunnelClassifications *lrucache.Cache
	splitTunnelClassificationTTL         time.Duration
	splitTunnelClassificationMaxEntries  int

	// Worker signaling channels; buffering choices are documented at the
	// make() sites in NewController.
	signalFetchCommonRemoteServerList chan struct{}
	signalFetchObfuscatedServerLists  chan struct{}
	signalDownloadUpgrade             chan string
	signalReportServerEntries         chan *serverEntriesReportRequest
	signalReportConnected             chan struct{}
	signalRestartEstablishing         chan struct{}
	serverAffinityDoneBroadcast       chan struct{}

	// Optional packet tunnel (tun device) client; nil unless configured.
	packetTunnelClient    *tun.Client
	packetTunnelTransport *PacketTunnelTransport

	staggerMutex sync.Mutex
	resolver     *resolver.Resolver
}

// NewController initializes a new controller.
func NewController(config *Config) (controller *Controller, err error) {

	if !config.IsCommitted() {
		return nil, errors.TraceNew("uncommitted config")
	}

	// Needed by regen, at least
	rand.Seed(int64(time.Now().Nanosecond()))

	// The session ID for the Psiphon server API is used across all
	// tunnels established by the controller.
	NoticeSessionId(config.SessionID)

	// Attempt to apply any valid, local stored tactics. The pre-done context
	// ensures no tactics request is attempted now.
	doneContext, cancelFunc := context.WithCancel(context.Background())
	cancelFunc()
	GetTactics(doneContext, config)

	p := config.GetParameters().Get()
	splitTunnelClassificationTTL :=
		p.Duration(parameters.SplitTunnelClassificationTTL)
	splitTunnelClassificationMaxEntries :=
		p.Int(parameters.SplitTunnelClassificationMaxEntries)

	controller = &Controller{
		config:       config,
		runWaitGroup: new(sync.WaitGroup),
		// connectedTunnels and failedTunnels buffer sizes are large enough to
		// receive full pools of tunnels without blocking. Senders should not block.
		connectedTunnels: make(chan *Tunnel, MAX_TUNNEL_POOL_SIZE),
		failedTunnels:    make(chan *Tunnel, MAX_TUNNEL_POOL_SIZE),
		tunnelPoolSize:   TUNNEL_POOL_SIZE,
		tunnels:          make([]*Tunnel, 0),
		establishedOnce:  false,
		isEstablishing:   false,

		untunneledSplitTunnelClassifications: lrucache.NewWithLRU(
			splitTunnelClassificationTTL,
			1*time.Minute,
			splitTunnelClassificationMaxEntries),

		// TODO: Add a buffer of 1 so we don't miss a signal while receiver is
		// starting? Trade-off is potential back-to-back fetch remotes. As-is,
		// establish will eventually signal another fetch remote.
		signalFetchCommonRemoteServerList: make(chan struct{}),
		signalFetchObfuscatedServerLists:  make(chan struct{}),
		signalDownloadUpgrade:             make(chan string),
		signalReportConnected:             make(chan struct{}),

		// Using a buffer of 1 to ensure there's no race between the first signal
		// sent and a channel receiver initializing; a side effect is that this
		// allows 1 additional scan to enqueue while a scan is in progress, possibly
		// resulting in one unnecessary scan.
		signalReportServerEntries: make(chan *serverEntriesReportRequest, 1),

		// signalRestartEstablishing has a buffer of 1 to ensure sending the
		// signal doesn't block and receiving won't miss a signal.
		signalRestartEstablishing: make(chan struct{}, 1),
	}

	// Initialize untunneledDialConfig, used by untunneled dials including
	// remote server list and upgrade downloads.
	controller.untunneledDialConfig = &DialConfig{
		UpstreamProxyURL: controller.config.UpstreamProxyURL,
		CustomHeaders:    controller.config.CustomHeaders,
		DeviceBinder:     controller.config.deviceBinder,
		IPv6Synthesizer:  controller.config.IPv6Synthesizer,
		ResolveIP: func(ctx context.Context, hostname string) ([]net.IP, error) {
			// Note: when domain fronting would be used for untunneled dials a
			// copy of untunneledDialConfig should be used instead, which
			// redefines ResolveIP such that the corresponding fronting
			// provider ID is passed into UntunneledResolveIP to enable the use
			// of pre-resolved IPs.
			IPs, err := UntunneledResolveIP(
				ctx, controller.config, controller.resolver, hostname, "")
			if err != nil {
				return nil, errors.Trace(err)
			}
			return IPs, nil
		},
		TrustedCACertificatesFilename: controller.config.TrustedCACertificatesFilename,
	}

	if config.PacketTunnelTunFileDescriptor > 0 {

		// Run a packet tunnel client. The lifetime of the tun.Client is the
		// lifetime of the Controller, so it exists across tunnel establishments
		// and reestablishments. The PacketTunnelTransport provides a layer
		// that presents a continuously existing transport to the tun.Client;
		// it's set to use new SSH channels after new SSH tunnel establishes.

		packetTunnelTransport := NewPacketTunnelTransport()

		packetTunnelClient, err := tun.NewClient(&tun.ClientConfig{
			Logger:                    NoticeCommonLogger(),
			TunFileDescriptor:         config.PacketTunnelTunFileDescriptor,
			TransparentDNSIPv4Address: config.PacketTunnelTransparentDNSIPv4Address,
			TransparentDNSIPv6Address: config.PacketTunnelTransparentDNSIPv6Address,
			Transport:                 packetTunnelTransport,
		})
		if err != nil {
			return nil, errors.Trace(err)
		}

		controller.packetTunnelClient = packetTunnelClient
		controller.packetTunnelTransport = packetTunnelTransport
	}

	return controller, nil
}

// Run executes the controller. Run exits if a controller
// component fails or the parent context is canceled.
func (controller *Controller) Run(ctx context.Context) {

	if controller.config.LimitCPUThreads {
		runtime.GOMAXPROCS(1)
	}

	pprofRun()

	// Ensure fresh repetitive notice state for each run, so the
	// client will always get an AvailableEgressRegions notice,
	// an initial instance of any repetitive error notice, etc.
	ResetRepetitiveNotices()

	runCtx, stopRunning := context.WithCancel(ctx)
	defer stopRunning()

	controller.runCtx = runCtx
	controller.stopRunning = stopRunning

	// Start components

	// Initialize a single resolver to be used by all dials. Sharing a single
	// resolver ensures cached results are shared, and that network state
	// query overhead is amortized over all dials. Multiple dials can resolve
	// domain concurrently.
	//
	// config.SetResolver makes this resolver available to MakeDialParameters.
	controller.resolver = NewResolver(controller.config, true)
	defer controller.resolver.Stop()
	controller.config.SetResolver(controller.resolver)

	// TODO: IPv6 support
	var listenIP string
	if controller.config.ListenInterface == "" {
		listenIP = "127.0.0.1"
	} else if controller.config.ListenInterface == "any" {
		listenIP = "0.0.0.0"
	} else {
		IPv4Address, _, err := common.GetInterfaceIPAddresses(controller.config.ListenInterface)
		if err == nil && IPv4Address == nil {
			err = fmt.Errorf("no IPv4 address for interface %s", controller.config.ListenInterface)
		}
		if err != nil {
			NoticeError("error getting listener IP: %v", errors.Trace(err))
			return
		}
		listenIP = IPv4Address.String()
	}

	if !controller.config.DisableLocalSocksProxy {
		socksProxy, err := NewSocksProxy(controller.config, controller, listenIP)
		if err != nil {
			NoticeError("error initializing local SOCKS proxy: %v", errors.Trace(err))
			return
		}
		defer socksProxy.Close()
	}

	if !controller.config.DisableLocalHTTPProxy {
		httpProxy, err := NewHttpProxy(controller.config, controller, listenIP)
		if err != nil {
			NoticeError("error initializing local HTTP proxy: %v", errors.Trace(err))
			return
		}
		defer httpProxy.Close()
	}

	if !controller.config.DisableRemoteServerListFetcher {

		if controller.config.RemoteServerListURLs != nil {
			controller.runWaitGroup.Add(1)
			go controller.remoteServerListFetcher(
				"common",
				FetchCommonRemoteServerList,
				controller.signalFetchCommonRemoteServerList)
		}

		if controller.config.ObfuscatedServerListRootURLs != nil {
			controller.runWaitGroup.Add(1)
			go controller.remoteServerListFetcher(
				"obfuscated",
				FetchObfuscatedServerLists,
				controller.signalFetchObfuscatedServerLists)
		}
	}

	if controller.config.UpgradeDownloadURLs != nil {
		controller.runWaitGroup.Add(1)
		go controller.upgradeDownloader()
	}

	controller.runWaitGroup.Add(1)
	go controller.serverEntriesReporter()

	controller.runWaitGroup.Add(1)
	go controller.connectedReporter()

	controller.runWaitGroup.Add(1)
	go controller.establishTunnelWatcher()

	controller.runWaitGroup.Add(1)
	go controller.runTunnels()

	if controller.packetTunnelClient != nil {
		controller.packetTunnelClient.Start()
	}

	// Wait while running

	<-controller.runCtx.Done()
	NoticeInfo("controller stopped")

	if controller.packetTunnelClient != nil {
		controller.packetTunnelClient.Stop()
	}

	// All workers -- runTunnels, establishment workers, and auxiliary
	// workers such as fetch remote server list and untunneled upgrade
	// download -- operate with the controller run context and will all
	// be interrupted when the run context is done.

	controller.runWaitGroup.Wait()

	NoticeInfo("exiting controller")

	NoticeExiting()
}

// SignalComponentFailure notifies the controller that an associated component has failed.
// This will terminate the controller.
func (controller *Controller) SignalComponentFailure() {
	NoticeWarning("controller shutdown due to component failure")
	controller.stopRunning()
}

// SetDynamicConfig overrides the sponsor ID and authorizations fields of the
// Controller config with the input values. The new values will be used in the
// next tunnel connection.
func (controller *Controller) SetDynamicConfig(sponsorID string, authorizations []string) {
	controller.config.SetDynamicConfig(sponsorID, authorizations)
}

// TerminateNextActiveTunnel terminates the active tunnel, which will initiate
// establishment of a new tunnel.
func (controller *Controller) TerminateNextActiveTunnel() {
	tunnel := controller.getNextActiveTunnel()
	if tunnel != nil {
		controller.SignalTunnelFailure(tunnel)
		NoticeInfo("terminated tunnel: %s", tunnel.dialParams.ServerEntry.GetDiagnosticID())
	}
}

// ExportExchangePayload creates a payload for client-to-client server
// connection info exchange. See the comment for psiphon.ExportExchangePayload
// for more details.
func (controller *Controller) ExportExchangePayload() string {
	return ExportExchangePayload(controller.config)
}

// ImportExchangePayload imports a payload generated by ExportExchangePayload.
// See the comment for psiphon.ImportExchangePayload for more details about
// the import.
//
// When the import is successful, a signal is set to trigger a restart any
// establishment in progress. This will cause the newly imported server entry
// to be prioritized, which it otherwise would not be in later establishment
// rounds. The establishment process continues after ImportExchangePayload
// returns.
//
// If the client already has a connected tunnel, or a tunnel connection is
// established concurrently with the import, the signal has no effect as the
// overall goal is establish _any_ connection.
func (controller *Controller) ImportExchangePayload(payload string) bool {

	// Race condition: if a new tunnel connection is established concurrently
	// with the import, either that tunnel's server entry of the imported server
	// entry may end up as the affinity server.

	ok := ImportExchangePayload(controller.config, payload)
	if !ok {
		return false
	}

	// Non-blocking send: signalRestartEstablishing has a buffer of 1, so a
	// pending signal is never lost and this send never blocks.
	select {
	case controller.signalRestartEstablishing <- struct{}{}:
	default:
	}

	return true
}

// remoteServerListFetcher fetches an out-of-band list of server entries
// for more tunnel candidates. It fetches when signalled, with retries
// on failure.
func (controller *Controller) remoteServerListFetcher(
	name string,
	fetcher RemoteServerListFetcher,
	signal <-chan struct{}) {

	defer controller.runWaitGroup.Done()

	var lastFetchTime time.Time

fetcherLoop:
	for {
		// Wait for a signal before fetching
		select {
		case <-signal:
		case <-controller.runCtx.Done():
			break fetcherLoop
		}

		// Skip fetch entirely (i.e., send no request at all, even when ETag would save
		// on response size) when a recent fetch was successful

		stalePeriod := controller.config.GetParameters().Get().Duration(
			parameters.FetchRemoteServerListStalePeriod)

		if !lastFetchTime.IsZero() &&
			lastFetchTime.Add(stalePeriod).After(time.Now()) {
			continue
		}

	retryLoop:
		for attempt := 0; ; attempt++ {
			// Don't attempt to fetch while there is no network connectivity,
			// to avoid alert notice noise.
			if !WaitForNetworkConnectivity(
				controller.runCtx,
				controller.config.NetworkConnectivityChecker) {
				break fetcherLoop
			}

			// Pick any active tunnel and make the next fetch attempt. If there's
			// no active tunnel, the untunneledDialConfig will be used.
			tunnel := controller.getNextActiveTunnel()

			err := fetcher(
				controller.runCtx,
				controller.config,
				attempt,
				tunnel,
				controller.untunneledDialConfig)

			if err == nil {
				lastFetchTime = time.Now()
				break retryLoop
			}

			NoticeWarning("failed to fetch %s remote server list: %v",
				name, errors.Trace(err))

			retryPeriod := controller.config.GetParameters().Get().Duration(
				parameters.FetchRemoteServerListRetryPeriod)

			// Wait out the retry period, but abort promptly on shutdown.
			timer := time.NewTimer(retryPeriod)
			select {
			case <-timer.C:
			case <-controller.runCtx.Done():
				timer.Stop()
				break fetcherLoop
			}
		}
	}

	NoticeInfo("exiting %s remote server list fetcher", name)
}

// upgradeDownloader makes periodic attempts to complete a client upgrade
// download. DownloadUpgrade() is resumable, so each attempt has potential for
// getting closer to completion, even in conditions where the download or
// tunnel is repeatedly interrupted.
// An upgrade download is triggered by either a handshake response indicating
// that a new version is available; or after failing to connect, in which case
// it's useful to check, out-of-band, for an upgrade with new circumvention
// capabilities.
// Once the download operation completes successfully, the downloader exits
// and is not run again: either there is not a newer version, or the upgrade
// has been downloaded and is ready to be applied.
// We're assuming that the upgrade will be applied and the entire system
// restarted before another upgrade is to be downloaded.
//
// TODO: refactor upgrade downloader and remote server list fetcher to use
// common code (including the resumable download routines).
func (controller *Controller) upgradeDownloader() {
	defer controller.runWaitGroup.Done()

	var lastDownloadTime time.Time

downloadLoop:
	for {
		// Wait for a signal before downloading
		var handshakeVersion string
		select {
		case handshakeVersion = <-controller.signalDownloadUpgrade:
		case <-controller.runCtx.Done():
			break downloadLoop
		}

		stalePeriod := controller.config.GetParameters().Get().Duration(
			parameters.FetchUpgradeStalePeriod)

		// Unless handshake is explicitly advertising a new version, skip
		// checking entirely when a recent download was successful.
		if handshakeVersion == "" &&
			!lastDownloadTime.IsZero() &&
			lastDownloadTime.Add(stalePeriod).After(time.Now()) {
			continue
		}

	retryLoop:
		for attempt := 0; ; attempt++ {
			// Don't attempt to download while there is no network connectivity,
			// to avoid alert notice noise.
			if !WaitForNetworkConnectivity(
				controller.runCtx,
				controller.config.NetworkConnectivityChecker) {
				break downloadLoop
			}

			// Pick any active tunnel and make the next download attempt. If there's
			// no active tunnel, the untunneledDialConfig will be used.
			tunnel := controller.getNextActiveTunnel()

			err := DownloadUpgrade(
				controller.runCtx,
				controller.config,
				attempt,
				handshakeVersion,
				tunnel,
				controller.untunneledDialConfig)

			if err == nil {
				lastDownloadTime = time.Now()
				break retryLoop
			}

			NoticeWarning("failed to download upgrade: %v", errors.Trace(err))

			timeout := controller.config.GetParameters().Get().Duration(
				parameters.FetchUpgradeRetryPeriod)

			// Wait out the retry period, but abort promptly on shutdown.
			timer := time.NewTimer(timeout)
			select {
			case <-timer.C:
			case <-controller.runCtx.Done():
				timer.Stop()
				break downloadLoop
			}
		}
	}

	NoticeInfo("exiting upgrade downloader")
}

// serverEntriesReportRequest is a request to serverEntriesReporter. When
// awaitResponse is non-nil, the requester blocks on it to receive the scan
// results.
type serverEntriesReportRequest struct {
	constraints   *protocolSelectionConstraints
	awaitResponse chan *serverEntriesReportResponse
}

// serverEntriesReportResponse carries the output of one server entry scan:
// candidate counts, subject to protocol selection constraints, and the set
// of available egress regions.
type serverEntriesReportResponse struct {
	err                              error
	candidates                       int
	initialCandidates                int
	initialCandidatesAnyEgressRegion int
	availableEgressRegions           []string
}

// serverEntriesReporter performs scans over all server entries to report on
// available tunnel candidates, subject to protocol selection constraints, and
// available egress regions.
//
// Because scans may be slow, depending on the client device and server entry
// list size, serverEntriesReporter is used to perform asynchronous, background
// operations that would otherwise block establishment. This includes emitting
// diagnostic notices that are informational (CandidateServers) or which do not
// need to emit before establishment starts (AvailableEgressRegions).
//
// serverEntriesReporter also serves to combine these scans, which would
// otherwise be logically independent, due to the performance impact of scans.
//
// The underlying datastore implementation _may_ block write transactions
// while there are open read transactions.
// For example, bolt write
// transactions which need to re-map the data file (when the datastore grows)
// will block on open read transactions. In these scenarios, a slow scan will
// still block other operations.
//
// serverEntriesReporter runs beyond the establishment phase, since it's
// important for notices such as AvailableEgressRegions to eventually emit
// even if already established. serverEntriesReporter scans are cancellable,
// so controller shutdown is not blocked by slow scans.
//
// In some special cases, establishment cannot begin without candidate counts
// up front. In these cases only, the request contains a non-nil
// awaitResponse, a channel which is used by the requester to block until the
// scan is complete and the candidate counts are available.
func (controller *Controller) serverEntriesReporter() {
	defer controller.runWaitGroup.Done()

loop:
	for {

		var request *serverEntriesReportRequest

		select {
		case request = <-controller.signalReportServerEntries:
		case <-controller.runCtx.Done():
			break loop
		}

		egressRegion := controller.config.EgressRegion
		constraints := request.constraints

		var response serverEntriesReportResponse

		regions := make(map[string]bool)

		callback := func(serverEntry *protocol.ServerEntry) bool {

			// In establishment, excludeIntensive depends on what set of protocols are
			// already being dialed. For these reports, don't exclude intensive
			// protocols as any intensive candidate can always be an available
			// candidate at some point.
			excludeIntensive := false

			isInitialCandidate := constraints.isInitialCandidate(excludeIntensive, serverEntry)
			isCandidate := constraints.isCandidate(excludeIntensive, serverEntry)

			if isInitialCandidate {
				response.initialCandidatesAnyEgressRegion += 1
			}

			if egressRegion == "" || serverEntry.Region == egressRegion {
				if isInitialCandidate {
					response.initialCandidates += 1
				}
				if isCandidate {
					response.candidates += 1
				}
			}

			isAvailable := isCandidate
			if constraints.hasInitialProtocols() {
				// Available egress regions is subject to an initial limit constraint, if
				// present: see AvailableEgressRegions comment in launchEstablishing.
				isAvailable = isInitialCandidate
			}

			if isAvailable {
				// Ignore server entries with no region field.
				if serverEntry.Region != "" {
					regions[serverEntry.Region] = true
				}
			}

			select {
			case <-controller.runCtx.Done():
				// Don't block controller shutdown: cancel the scan.
				return false
			default:
				return true
			}
		}

		startTime := time.Now()

		response.err = ScanServerEntries(callback)

		// Report this duration in CandidateServers as an indication of datastore
		// performance.
		duration := time.Since(startTime)

		response.availableEgressRegions = make([]string, 0, len(regions))
		for region := range regions {
			response.availableEgressRegions = append(response.availableEgressRegions, region)
		}

		if response.err != nil {

			// For diagnostics, we'll post this even when cancelled due to shutdown.
			NoticeWarning("ScanServerEntries failed: %v", errors.Trace(response.err))

			// Continue and send error response. Clear any partial data to avoid
			// misuse.
			response.candidates = 0
			response.initialCandidates = 0
			response.initialCandidatesAnyEgressRegion = 0
			response.availableEgressRegions = []string{}
		}

		if request.awaitResponse != nil {
			select {
			case request.awaitResponse <- &response:
			case <-controller.runCtx.Done():
				// The receiver may be gone when shutting down.
			}
		}

		if response.err == nil {

			NoticeCandidateServers(
				controller.config.EgressRegion,
				controller.protocolSelectionConstraints,
				response.initialCandidates,
				response.candidates,
				duration)

			NoticeAvailableEgressRegions(
				response.availableEgressRegions)
		}
	}

	NoticeInfo("exiting server entries reporter")
}

// signalServerEntriesReporter triggers a new server entry report. Set
// request.awaitResponse to obtain the report output. When awaitResponse is
// set, signalServerEntriesReporter blocks until the reporter receives the
// request, guaranteeing the new report runs. Otherwise, the report is
// considered to be informational and may or may not run, depending on whether
// another run is already in progress.
func (controller *Controller) signalServerEntriesReporter(request *serverEntriesReportRequest) {

	if request.awaitResponse == nil {
		// Informational: non-blocking send; drop the signal if the reporter
		// is busy and its buffer is full.
		select {
		case controller.signalReportServerEntries <- request:
		default:
		}
	} else {
		// Blocking send: the requester needs a guaranteed report run.
		controller.signalReportServerEntries <- request
	}
}

// connectedReporter sends periodic "connected" requests to the Psiphon API.
// These requests are for server-side unique user stats calculation. See the
// comment in DoConnectedRequest for a description of the request mechanism.
//
// To correctly count daily unique users, only one connected request is made
// across all simultaneous multi-tunnels; and the connected request is
// repeated every 24h.
//
// The signalReportConnected mechanism is used to trigger a connected request
// immediately after a reconnect. While strictly only one connected request
// per 24h is required in order to count daily unique users, the connected
// request also delivers the establishment duration metric (which includes
// time elapsed performing the handshake request) and additional fragmentation
// metrics; these metrics are measured for each tunnel.
func (controller *Controller) connectedReporter() {
	defer controller.runWaitGroup.Done()

	// session is nil when DisableApi is set
	if controller.config.DisableApi {
		return
	}

	select {
	case <-controller.signalReportConnected:
		// Make the initial connected request
	case <-controller.runCtx.Done():
		return
	}

loop:
	for {

		// Pick any active tunnel and make the next connected request. No error is
		// logged if there's no active tunnel, as that's not an unexpected
		// condition.
		reported := false
		tunnel := controller.getNextActiveTunnel()
		if tunnel != nil {
			err := tunnel.serverContext.DoConnectedRequest()
			if err == nil {
				reported = true
			} else {
				NoticeWarning("failed to make connected request: %v",
					errors.Trace(err))
			}
		}

		// Schedule the next connected request and wait. This duration is not a
		// dynamic ClientParameter as the daily unique user stats logic specifically
		// requires a "connected" request no more or less often than every 24h.
		var duration time.Duration
		if reported {
			duration = 24 * time.Hour
		} else {
			duration = controller.config.GetParameters().Get().Duration(
				parameters.PsiphonAPIConnectedRequestRetryPeriod)
		}
		timer := time.NewTimer(duration)
		doBreak := false
		select {
		case <-controller.signalReportConnected:
		case <-timer.C:
			// Make another connected request
		case <-controller.runCtx.Done():
			doBreak = true
		}
		timer.Stop()
		if doBreak {
			break loop
		}
	}

	NoticeInfo("exiting connected reporter")
}

// signalConnectedReporter signals connectedReporter to make a connected
// request. The send is non-blocking: if the reporter is busy, the signal is
// dropped (a request is already pending or in progress).
func (controller *Controller) signalConnectedReporter() {

	// session is nil when DisableApi is set
	if controller.config.DisableApi {
		return
	}

	select {
	case controller.signalReportConnected <- struct{}{}:
	default:
	}
}

// establishTunnelWatcher terminates the controller if a tunnel
// has not been established in the configured time period. This
// is regardless of how many tunnels are presently active -- meaning
// that if an active tunnel was established and lost the controller
// is left running (to re-establish).
func (controller *Controller) establishTunnelWatcher() {
	defer controller.runWaitGroup.Done()

	timeout := controller.config.GetParameters().Get().Duration(
		parameters.EstablishTunnelTimeout)

	// A timeout of 0 disables the watcher entirely.
	if timeout > 0 {
		timer := time.NewTimer(timeout)
		defer timer.Stop()

		select {
		case <-timer.C:
			if !controller.hasEstablishedOnce() {
				NoticeEstablishTunnelTimeout(timeout)
				controller.SignalComponentFailure()
			}
		case <-controller.runCtx.Done():
		}
	}

	NoticeInfo("exiting establish tunnel watcher")
}

// runTunnels is the controller tunnel management main loop. It starts and stops
// establishing tunnels based on the target tunnel pool size and the current size
// of the pool. Tunnels are established asynchronously using worker goroutines.
859 // 860 // When there are no server entries for the target region/protocol, the 861 // establishCandidateGenerator will yield no candidates and wait before 862 // trying again. In the meantime, a remote server entry fetch may supply 863 // valid candidates. 864 // 865 // When a tunnel is established, it's added to the active pool. The tunnel's 866 // operateTunnel goroutine monitors the tunnel. 867 // 868 // When a tunnel fails, it's removed from the pool and the establish process is 869 // restarted to fill the pool. 870 func (controller *Controller) runTunnels() { 871 defer controller.runWaitGroup.Done() 872 873 // Start running 874 875 controller.startEstablishing() 876 loop: 877 for { 878 select { 879 880 case <-controller.signalRestartEstablishing: 881 882 // signalRestartEstablishing restarts any establishment in progress. One 883 // use case for this is to prioritize a newly imported, exchanged server 884 // entry, which will be in the affinity position. 885 // 886 // It's possible for another connection to establish concurrent to signalling; 887 // since the overall goal remains to establish _any_ connection, we accept that 888 // in some cases the exchanged server entry may not get used. 889 890 if controller.isEstablishing { 891 controller.stopEstablishing() 892 controller.startEstablishing() 893 } 894 895 case failedTunnel := <-controller.failedTunnels: 896 NoticeWarning("tunnel failed: %s", failedTunnel.dialParams.ServerEntry.GetDiagnosticID()) 897 controller.terminateTunnel(failedTunnel) 898 899 // Clear the reference to this tunnel before calling startEstablishing, 900 // which will invoke a garbage collection. 901 failedTunnel = nil 902 903 // Concurrency note: only this goroutine may call startEstablishing/stopEstablishing, 904 // which reference controller.isEstablishing. 905 controller.startEstablishing() 906 907 case connectedTunnel := <-controller.connectedTunnels: 908 909 // Tunnel establishment has two phases: connection and activation. 
910 // 911 // Connection is run concurrently by the establishTunnelWorkers, to minimize 912 // delay when it's not yet known which server and protocol will be available 913 // and unblocked. 914 // 915 // Activation is run serially, here, to minimize the overhead of making a 916 // handshake request and starting the operateTunnel management worker for a 917 // tunnel which may be discarded. 918 // 919 // When the active tunnel will complete establishment, establishment is 920 // stopped before activation. This interrupts all connecting tunnels and 921 // garbage collects their memory. The purpose is to minimize memory 922 // pressure when the handshake request is made. In the unlikely case that the 923 // handshake fails, establishment is restarted. 924 // 925 // Any delays in stopEstablishing will delay the handshake for the last 926 // active tunnel. 927 // 928 // In the typical case of tunnelPoolSize of 1, only a single handshake is 929 // performed and the homepages notices file, when used, will not be modifed 930 // after the NoticeTunnels(1) [i.e., connected] until NoticeTunnels(0) [i.e., 931 // disconnected]. For tunnelPoolSize > 1, serial handshakes only ensures that 932 // each set of emitted NoticeHomepages is contiguous. 933 934 active, outstanding := controller.numTunnels() 935 936 // discardTunnel will be true here when already fully established. 
937 938 discardTunnel := (outstanding <= 0) 939 isFirstTunnel := (active == 0) 940 isLastTunnel := (outstanding == 1) 941 942 if !discardTunnel { 943 944 if isLastTunnel { 945 controller.stopEstablishing() 946 } 947 948 err := connectedTunnel.Activate(controller.runCtx, controller) 949 950 if err != nil { 951 NoticeWarning("failed to activate %s: %v", 952 connectedTunnel.dialParams.ServerEntry.GetDiagnosticID(), 953 errors.Trace(err)) 954 discardTunnel = true 955 } else { 956 // It's unlikely that registerTunnel will fail, since only this goroutine 957 // calls registerTunnel -- and after checking numTunnels; so failure is not 958 // expected. 959 if !controller.registerTunnel(connectedTunnel) { 960 NoticeWarning("failed to register %s: %v", 961 connectedTunnel.dialParams.ServerEntry.GetDiagnosticID(), 962 errors.Trace(err)) 963 discardTunnel = true 964 } 965 } 966 967 // May need to replace this tunnel 968 if isLastTunnel && discardTunnel { 969 controller.startEstablishing() 970 } 971 972 } 973 974 if discardTunnel { 975 controller.discardTunnel(connectedTunnel) 976 977 // Clear the reference to this discarded tunnel and immediately run 978 // a garbage collection to reclaim its memory. 979 connectedTunnel = nil 980 DoGarbageCollection() 981 982 // Skip the rest of this case 983 break 984 } 985 986 atomic.AddInt32(&controller.establishedTunnelsCount, 1) 987 988 NoticeActiveTunnel( 989 connectedTunnel.dialParams.ServerEntry.GetDiagnosticID(), 990 connectedTunnel.dialParams.TunnelProtocol, 991 connectedTunnel.dialParams.ServerEntry.SupportsSSHAPIRequests()) 992 993 if isFirstTunnel { 994 995 // Signal a connected request on each 1st tunnel establishment. For 996 // multi-tunnels, the session is connected as long as at least one 997 // tunnel is established. 998 controller.signalConnectedReporter() 999 1000 // If the handshake indicated that a new client version is available, 1001 // trigger an upgrade download. 
1002 // Note: serverContext is nil when DisableApi is set 1003 if connectedTunnel.serverContext != nil && 1004 connectedTunnel.serverContext.clientUpgradeVersion != "" { 1005 1006 handshakeVersion := connectedTunnel.serverContext.clientUpgradeVersion 1007 select { 1008 case controller.signalDownloadUpgrade <- handshakeVersion: 1009 default: 1010 } 1011 } 1012 } 1013 1014 // Set the new tunnel as the transport for the packet tunnel. The packet tunnel 1015 // client remains up when reestablishing, but no packets are relayed while there 1016 // is no connected tunnel. UseTunnel will establish a new packet tunnel SSH 1017 // channel over the new SSH tunnel and configure the packet tunnel client to use 1018 // the new SSH channel as its transport. 1019 // 1020 // Note: as is, this logic is suboptimal for tunnelPoolSize > 1, as this would 1021 // continuously initialize new packet tunnel sessions for each established 1022 // server. For now, config validation requires tunnelPoolSize == 1 when 1023 // the packet tunnel is used. 1024 1025 if controller.packetTunnelTransport != nil { 1026 controller.packetTunnelTransport.UseTunnel(connectedTunnel) 1027 } 1028 1029 if controller.isFullyEstablished() { 1030 controller.stopEstablishing() 1031 } 1032 1033 case <-controller.runCtx.Done(): 1034 break loop 1035 } 1036 } 1037 1038 // Stop running 1039 1040 controller.stopEstablishing() 1041 controller.terminateAllTunnels() 1042 1043 // Drain tunnel channels 1044 close(controller.connectedTunnels) 1045 for tunnel := range controller.connectedTunnels { 1046 controller.discardTunnel(tunnel) 1047 } 1048 close(controller.failedTunnels) 1049 for tunnel := range controller.failedTunnels { 1050 controller.discardTunnel(tunnel) 1051 } 1052 1053 NoticeInfo("exiting run tunnels") 1054 } 1055 1056 // SignalSeededNewSLOK implements the TunnelOwner interface. 
This function 1057 // is called by Tunnel.operateTunnel when the tunnel has received a new, 1058 // previously unknown SLOK from the server. The Controller triggers an OSL 1059 // fetch, as the new SLOK may be sufficient to access new OSLs. 1060 func (controller *Controller) SignalSeededNewSLOK() { 1061 select { 1062 case controller.signalFetchObfuscatedServerLists <- struct{}{}: 1063 default: 1064 } 1065 } 1066 1067 // SignalTunnelFailure implements the TunnelOwner interface. This function 1068 // is called by Tunnel.operateTunnel when the tunnel has detected that it 1069 // has failed. The Controller will signal runTunnels to create a new 1070 // tunnel and/or remove the tunnel from the list of active tunnels. 1071 func (controller *Controller) SignalTunnelFailure(tunnel *Tunnel) { 1072 // Don't block. Assumes the receiver has a buffer large enough for 1073 // the typical number of operated tunnels. In case there's no room, 1074 // terminate the tunnel (runTunnels won't get a signal in this case, 1075 // but the tunnel will be removed from the list of active tunnels). 1076 select { 1077 case controller.failedTunnels <- tunnel: 1078 default: 1079 controller.terminateTunnel(tunnel) 1080 } 1081 } 1082 1083 // discardTunnel disposes of a successful connection that is no longer required. 1084 func (controller *Controller) discardTunnel(tunnel *Tunnel) { 1085 NoticeInfo("discard tunnel: %s", tunnel.dialParams.ServerEntry.GetDiagnosticID()) 1086 // TODO: not calling PromoteServerEntry, since that would rank the 1087 // discarded tunnel before fully active tunnels. Can a discarded tunnel 1088 // be promoted (since it connects), but with lower rank than all active 1089 // tunnels? 1090 tunnel.Close(true) 1091 } 1092 1093 // registerTunnel adds the connected tunnel to the pool of active tunnels 1094 // which are candidates for port forwarding. Returns true if the pool has an 1095 // empty slot and false if the pool is full (caller should discard the tunnel). 
1096 func (controller *Controller) registerTunnel(tunnel *Tunnel) bool { 1097 controller.tunnelMutex.Lock() 1098 defer controller.tunnelMutex.Unlock() 1099 if len(controller.tunnels) >= controller.tunnelPoolSize { 1100 return false 1101 } 1102 // Perform a final check just in case we've established 1103 // a duplicate connection. 1104 for _, activeTunnel := range controller.tunnels { 1105 if activeTunnel.dialParams.ServerEntry.IpAddress == 1106 tunnel.dialParams.ServerEntry.IpAddress { 1107 1108 NoticeWarning("duplicate tunnel: %s", tunnel.dialParams.ServerEntry.GetDiagnosticID()) 1109 return false 1110 } 1111 } 1112 controller.establishedOnce = true 1113 controller.tunnels = append(controller.tunnels, tunnel) 1114 NoticeTunnels(len(controller.tunnels)) 1115 1116 // Promote this successful tunnel to first rank so it's one 1117 // of the first candidates next time establish runs. 1118 // Connecting to a TargetServerEntry does not change the 1119 // ranking. 1120 if controller.config.TargetServerEntry == "" { 1121 PromoteServerEntry(controller.config, tunnel.dialParams.ServerEntry.IpAddress) 1122 } 1123 1124 return true 1125 } 1126 1127 // hasEstablishedOnce indicates if at least one active tunnel has 1128 // been established up to this point. This is regardeless of how many 1129 // tunnels are presently active. 1130 func (controller *Controller) hasEstablishedOnce() bool { 1131 controller.tunnelMutex.Lock() 1132 defer controller.tunnelMutex.Unlock() 1133 return controller.establishedOnce 1134 } 1135 1136 // isFullyEstablished indicates if the pool of active tunnels is full. 1137 func (controller *Controller) isFullyEstablished() bool { 1138 controller.tunnelMutex.Lock() 1139 defer controller.tunnelMutex.Unlock() 1140 return len(controller.tunnels) >= controller.tunnelPoolSize 1141 } 1142 1143 // numTunnels returns the number of active and outstanding tunnels. 1144 // Oustanding is the number of tunnels required to fill the pool of 1145 // active tunnels. 
1146 func (controller *Controller) numTunnels() (int, int) { 1147 controller.tunnelMutex.Lock() 1148 defer controller.tunnelMutex.Unlock() 1149 active := len(controller.tunnels) 1150 outstanding := controller.tunnelPoolSize - len(controller.tunnels) 1151 return active, outstanding 1152 } 1153 1154 // terminateTunnel removes a tunnel from the pool of active tunnels 1155 // and closes the tunnel. The next-tunnel state used by getNextActiveTunnel 1156 // is adjusted as required. 1157 func (controller *Controller) terminateTunnel(tunnel *Tunnel) { 1158 controller.tunnelMutex.Lock() 1159 defer controller.tunnelMutex.Unlock() 1160 for index, activeTunnel := range controller.tunnels { 1161 if tunnel == activeTunnel { 1162 controller.tunnels = append( 1163 controller.tunnels[:index], controller.tunnels[index+1:]...) 1164 if controller.nextTunnel > index { 1165 controller.nextTunnel-- 1166 } 1167 if controller.nextTunnel >= len(controller.tunnels) { 1168 controller.nextTunnel = 0 1169 } 1170 activeTunnel.Close(false) 1171 NoticeTunnels(len(controller.tunnels)) 1172 break 1173 } 1174 } 1175 } 1176 1177 // terminateAllTunnels empties the tunnel pool, closing all active tunnels. 1178 // This is used when shutting down the controller. 1179 func (controller *Controller) terminateAllTunnels() { 1180 controller.tunnelMutex.Lock() 1181 defer controller.tunnelMutex.Unlock() 1182 // Closing all tunnels in parallel. In an orderly shutdown, each tunnel 1183 // may take a few seconds to send a final status request. We only want 1184 // to wait as long as the single slowest tunnel. 
1185 closeWaitGroup := new(sync.WaitGroup) 1186 closeWaitGroup.Add(len(controller.tunnels)) 1187 for _, activeTunnel := range controller.tunnels { 1188 tunnel := activeTunnel 1189 go func() { 1190 defer closeWaitGroup.Done() 1191 tunnel.Close(false) 1192 }() 1193 } 1194 closeWaitGroup.Wait() 1195 controller.tunnels = make([]*Tunnel, 0) 1196 controller.nextTunnel = 0 1197 NoticeTunnels(len(controller.tunnels)) 1198 } 1199 1200 // getNextActiveTunnel returns the next tunnel from the pool of active 1201 // tunnels. Currently, tunnel selection order is simple round-robin. 1202 func (controller *Controller) getNextActiveTunnel() (tunnel *Tunnel) { 1203 controller.tunnelMutex.Lock() 1204 defer controller.tunnelMutex.Unlock() 1205 if len(controller.tunnels) == 0 { 1206 return nil 1207 } 1208 tunnel = controller.tunnels[controller.nextTunnel] 1209 controller.nextTunnel = 1210 (controller.nextTunnel + 1) % len(controller.tunnels) 1211 return tunnel 1212 } 1213 1214 // isActiveTunnelServerEntry is used to check if there's already 1215 // an existing tunnel to a candidate server. 
1216 func (controller *Controller) isActiveTunnelServerEntry( 1217 serverEntry *protocol.ServerEntry) bool { 1218 1219 controller.tunnelMutex.Lock() 1220 defer controller.tunnelMutex.Unlock() 1221 for _, activeTunnel := range controller.tunnels { 1222 if activeTunnel.dialParams.ServerEntry.IpAddress == serverEntry.IpAddress { 1223 return true 1224 } 1225 } 1226 return false 1227 } 1228 1229 func (controller *Controller) setTunnelPoolSize(tunnelPoolSize int) { 1230 controller.tunnelMutex.Lock() 1231 defer controller.tunnelMutex.Unlock() 1232 if tunnelPoolSize < 1 { 1233 tunnelPoolSize = 1 1234 } 1235 if tunnelPoolSize > MAX_TUNNEL_POOL_SIZE { 1236 tunnelPoolSize = MAX_TUNNEL_POOL_SIZE 1237 } 1238 controller.tunnelPoolSize = tunnelPoolSize 1239 } 1240 1241 func (controller *Controller) getTunnelPoolSize() int { 1242 controller.tunnelMutex.Lock() 1243 defer controller.tunnelMutex.Unlock() 1244 return controller.tunnelPoolSize 1245 } 1246 1247 // Dial selects an active tunnel and establishes a port forward 1248 // connection through the selected tunnel. Failure to connect is considered 1249 // a port forward failure, for the purpose of monitoring tunnel health. 1250 // 1251 // When split tunnel mode is enabled, the connection may be untunneled, 1252 // depending on GeoIP classification of the destination. 1253 // 1254 // downstreamConn is an optional parameter which specifies a connection to be 1255 // explicitly closed when the dialed connection is closed. For instance, this 1256 // is used to close downstreamConn App<->LocalProxy connections when the 1257 // related LocalProxy<->SshPortForward connections close. 
1258 func (controller *Controller) Dial( 1259 remoteAddr string, downstreamConn net.Conn) (conn net.Conn, err error) { 1260 1261 tunnel := controller.getNextActiveTunnel() 1262 if tunnel == nil { 1263 return nil, errors.TraceNew("no active tunnels") 1264 } 1265 1266 if !tunnel.config.IsSplitTunnelEnabled() { 1267 1268 tunneledConn, splitTunnel, err := tunnel.DialTCPChannel( 1269 remoteAddr, false, downstreamConn) 1270 if err != nil { 1271 return nil, errors.Trace(err) 1272 } 1273 1274 if splitTunnel { 1275 return nil, errors.TraceNew( 1276 "unexpected split tunnel classification") 1277 } 1278 1279 return tunneledConn, nil 1280 } 1281 1282 // In split tunnel mode, TCP port forwards to destinations in the same 1283 // country as the client are untunneled. 1284 // 1285 // Split tunnel is implemented with assistence from the server to classify 1286 // destinations as being in the same country as the client. The server knows 1287 // the client's public IP GeoIP data, and, for clients with split tunnel mode 1288 // enabled, the server resolves the port forward destination address and 1289 // checks the destination IP GeoIP data. 1290 // 1291 // When the countries match, the server "rejects" the port forward with a 1292 // distinct response that indicates to the client that an untunneled port 1293 // foward should be established locally. 1294 // 1295 // The client maintains a classification cache that allows it to make 1296 // untunneled port forwards without requiring a round trip to the server. 1297 // Only destinations classified as untunneled are stored in the cache: a 1298 // destination classified as tunneled requires the same round trip as an 1299 // unknown destination. 1300 // 1301 // When the countries do not match, the server establishes a port forward, as 1302 // it does for all port forwards in non-split tunnel mode. There is no 1303 // additional round trip for tunneled port forwards. 
1304 1305 splitTunnelHost, _, err := net.SplitHostPort(remoteAddr) 1306 if err != nil { 1307 return nil, errors.Trace(err) 1308 } 1309 1310 untunneledCache := controller.untunneledSplitTunnelClassifications 1311 1312 // If the destination hostname is in the untunneled split tunnel 1313 // classifications cache, skip the round trip to the server and do the 1314 // direct, untunneled dial immediately. 1315 _, cachedUntunneled := untunneledCache.Get(splitTunnelHost) 1316 1317 if !cachedUntunneled { 1318 1319 tunneledConn, splitTunnel, err := tunnel.DialTCPChannel( 1320 remoteAddr, false, downstreamConn) 1321 if err != nil { 1322 return nil, errors.Trace(err) 1323 } 1324 1325 if !splitTunnel { 1326 1327 // Clear any cached untunneled classification entry for this destination 1328 // hostname, as the server is now classifying it as tunneled. 1329 untunneledCache.Delete(splitTunnelHost) 1330 1331 return tunneledConn, nil 1332 } 1333 1334 // The server has indicated that the client should make a direct, 1335 // untunneled dial. Cache the classification to avoid this round trip in 1336 // the immediate future. 1337 untunneledCache.Add(splitTunnelHost, true, lrucache.DefaultExpiration) 1338 } 1339 1340 NoticeUntunneled(splitTunnelHost) 1341 1342 untunneledConn, err := controller.DirectDial(remoteAddr) 1343 if err != nil { 1344 return nil, errors.Trace(err) 1345 } 1346 1347 return untunneledConn, nil 1348 } 1349 1350 // DirectDial dials an untunneled TCP connection within the controller run context. 1351 func (controller *Controller) DirectDial(remoteAddr string) (conn net.Conn, err error) { 1352 return DialTCP(controller.runCtx, remoteAddr, controller.untunneledDialConfig) 1353 } 1354 1355 // triggerFetches signals RSL, OSL, and upgrade download fetchers to begin, if 1356 // not already running. 
triggerFetches is called when tunnel establishment 1357 // fails to complete within a deadline and in other cases where local 1358 // circumvention capabilities are lacking and we may require new server 1359 // entries or client versions with new capabilities. 1360 func (controller *Controller) triggerFetches() { 1361 1362 // Trigger a common remote server list fetch, since we may have failed 1363 // to connect with all known servers. Don't block sending signal, since 1364 // this signal may have already been sent. 1365 // Don't wait for fetch remote to succeed, since it may fail and 1366 // enter a retry loop and we're better off trying more known servers. 1367 // TODO: synchronize the fetch response, so it can be incorporated 1368 // into the server entry iterator as soon as available. 1369 select { 1370 case controller.signalFetchCommonRemoteServerList <- struct{}{}: 1371 default: 1372 } 1373 1374 // Trigger an OSL fetch in parallel. Both fetches are run in parallel 1375 // so that if one out of the common RLS and OSL set is large, it doesn't 1376 // doesn't entirely block fetching the other. 1377 select { 1378 case controller.signalFetchObfuscatedServerLists <- struct{}{}: 1379 default: 1380 } 1381 1382 // Trigger an out-of-band upgrade availability check and download. 1383 // Since we may have failed to connect, we may benefit from upgrading 1384 // to a new client version with new circumvention capabilities. 
1385 select { 1386 case controller.signalDownloadUpgrade <- "": 1387 default: 1388 } 1389 } 1390 1391 type protocolSelectionConstraints struct { 1392 useUpstreamProxy bool 1393 initialLimitTunnelProtocols protocol.TunnelProtocols 1394 initialLimitTunnelProtocolsCandidateCount int 1395 limitTunnelProtocols protocol.TunnelProtocols 1396 limitTunnelDialPortNumbers protocol.TunnelProtocolPortLists 1397 limitQUICVersions protocol.QUICVersions 1398 replayCandidateCount int 1399 } 1400 1401 func (p *protocolSelectionConstraints) hasInitialProtocols() bool { 1402 return len(p.initialLimitTunnelProtocols) > 0 && p.initialLimitTunnelProtocolsCandidateCount > 0 1403 } 1404 1405 func (p *protocolSelectionConstraints) isInitialCandidate( 1406 excludeIntensive bool, 1407 serverEntry *protocol.ServerEntry) bool { 1408 1409 return p.hasInitialProtocols() && 1410 len(serverEntry.GetSupportedProtocols( 1411 conditionallyEnabledComponents{}, 1412 p.useUpstreamProxy, 1413 p.initialLimitTunnelProtocols, 1414 p.limitTunnelDialPortNumbers, 1415 p.limitQUICVersions, 1416 excludeIntensive)) > 0 1417 } 1418 1419 func (p *protocolSelectionConstraints) isCandidate( 1420 excludeIntensive bool, 1421 serverEntry *protocol.ServerEntry) bool { 1422 1423 return len(serverEntry.GetSupportedProtocols( 1424 conditionallyEnabledComponents{}, 1425 p.useUpstreamProxy, 1426 p.limitTunnelProtocols, 1427 p.limitTunnelDialPortNumbers, 1428 p.limitQUICVersions, 1429 excludeIntensive)) > 0 1430 } 1431 1432 func (p *protocolSelectionConstraints) canReplay( 1433 connectTunnelCount int, 1434 excludeIntensive bool, 1435 serverEntry *protocol.ServerEntry, 1436 replayProtocol string) bool { 1437 1438 if p.replayCandidateCount != -1 && connectTunnelCount > p.replayCandidateCount { 1439 return false 1440 } 1441 1442 return common.Contains( 1443 p.supportedProtocols(connectTunnelCount, excludeIntensive, serverEntry), 1444 replayProtocol) 1445 } 1446 1447 func (p *protocolSelectionConstraints) supportedProtocols( 1448 
connectTunnelCount int, 1449 excludeIntensive bool, 1450 serverEntry *protocol.ServerEntry) []string { 1451 1452 limitTunnelProtocols := p.limitTunnelProtocols 1453 1454 if len(p.initialLimitTunnelProtocols) > 0 && 1455 p.initialLimitTunnelProtocolsCandidateCount > connectTunnelCount { 1456 1457 limitTunnelProtocols = p.initialLimitTunnelProtocols 1458 } 1459 1460 return serverEntry.GetSupportedProtocols( 1461 conditionallyEnabledComponents{}, 1462 p.useUpstreamProxy, 1463 limitTunnelProtocols, 1464 p.limitTunnelDialPortNumbers, 1465 p.limitQUICVersions, 1466 excludeIntensive) 1467 } 1468 1469 func (p *protocolSelectionConstraints) selectProtocol( 1470 connectTunnelCount int, 1471 excludeIntensive bool, 1472 serverEntry *protocol.ServerEntry) (string, bool) { 1473 1474 candidateProtocols := p.supportedProtocols(connectTunnelCount, excludeIntensive, serverEntry) 1475 1476 if len(candidateProtocols) == 0 { 1477 return "", false 1478 } 1479 1480 // Pick at random from the supported protocols. This ensures that we'll 1481 // eventually try all possible protocols. Depending on network 1482 // configuration, it may be the case that some protocol is only available 1483 // through multi-capability servers, and a simpler ranked preference of 1484 // protocols could lead to that protocol never being selected. 1485 1486 index := prng.Intn(len(candidateProtocols)) 1487 1488 return candidateProtocols[index], true 1489 1490 } 1491 1492 type candidateServerEntry struct { 1493 serverEntry *protocol.ServerEntry 1494 isServerAffinityCandidate bool 1495 adjustedEstablishStartTime time.Time 1496 } 1497 1498 // startEstablishing creates a pool of worker goroutines which will 1499 // attempt to establish tunnels to candidate servers. The candidates 1500 // are generated by another goroutine. 
1501 func (controller *Controller) startEstablishing() { 1502 if controller.isEstablishing { 1503 return 1504 } 1505 NoticeInfo("start establishing") 1506 1507 // establishStartTime is used to calculate and report the client's tunnel 1508 // establishment duration. Establishment duration should include all 1509 // initialization in launchEstablishing and establishCandidateGenerator, 1510 // including any potentially long-running datastore iterations. 1511 establishStartTime := time.Now() 1512 1513 controller.concurrentEstablishTunnelsMutex.Lock() 1514 controller.establishConnectTunnelCount = 0 1515 controller.concurrentEstablishTunnels = 0 1516 controller.concurrentIntensiveEstablishTunnels = 0 1517 controller.peakConcurrentEstablishTunnels = 0 1518 controller.peakConcurrentIntensiveEstablishTunnels = 0 1519 controller.concurrentEstablishTunnelsMutex.Unlock() 1520 1521 DoGarbageCollection() 1522 emitMemoryMetrics() 1523 1524 // The establish context cancelFunc, controller.stopEstablish, is called in 1525 // controller.stopEstablishing. 1526 1527 controller.isEstablishing = true 1528 controller.establishStartTime = establishStartTime 1529 controller.establishCtx, controller.stopEstablish = context.WithCancel(controller.runCtx) 1530 controller.establishWaitGroup = new(sync.WaitGroup) 1531 controller.candidateServerEntries = make(chan *candidateServerEntry) 1532 1533 // The server affinity mechanism attempts to favor the previously 1534 // used server when reconnecting. This is beneficial for user 1535 // applications which expect consistency in user IP address (for 1536 // example, a web site which prompts for additional user 1537 // authentication when the IP address changes). 1538 // 1539 // Only the very first server, as determined by 1540 // datastore.PromoteServerEntry(), is the server affinity candidate. 1541 // Concurrent connections attempts to many servers are launched 1542 // without delay, in case the affinity server connection fails. 
1543 // While the affinity server connection is outstanding, when any 1544 // other connection is established, there is a short grace period 1545 // delay before delivering the established tunnel; this allows some 1546 // time for the affinity server connection to succeed first. 1547 // When the affinity server connection fails, any other established 1548 // tunnel is registered without delay. 1549 // 1550 // Note: the establishTunnelWorker that receives the affinity 1551 // candidate is solely resonsible for closing 1552 // controller.serverAffinityDoneBroadcast. 1553 controller.serverAffinityDoneBroadcast = make(chan struct{}) 1554 1555 controller.establishWaitGroup.Add(1) 1556 go controller.launchEstablishing() 1557 } 1558 1559 func (controller *Controller) launchEstablishing() { 1560 1561 defer controller.establishWaitGroup.Done() 1562 1563 // Before starting the establish tunnel workers, get and apply 1564 // tactics, launching a tactics request if required. 1565 // 1566 // Wait only TacticsWaitPeriod for the tactics request to complete (or 1567 // fail) before proceeding with tunnel establishment, in case the tactics 1568 // request is blocked or takes very long to complete. 1569 // 1570 // An in-flight tactics request uses meek in round tripper mode, which 1571 // uses less resources than meek tunnel relay mode. For this reason, the 1572 // tactics request is not counted in concurrentIntensiveEstablishTunnels. 1573 // 1574 // TODO: HTTP/2 uses significantly more memory, so perhaps 1575 // concurrentIntensiveEstablishTunnels should be counted in that case. 1576 // 1577 // Any in-flight tactics request or pending retry will be 1578 // canceled when establishment is stopped. 
1579 1580 if !controller.config.DisableTactics { 1581 1582 timeout := controller.config.GetParameters().Get().Duration( 1583 parameters.TacticsWaitPeriod) 1584 1585 tacticsDone := make(chan struct{}) 1586 tacticsWaitPeriod := time.NewTimer(timeout) 1587 defer tacticsWaitPeriod.Stop() 1588 1589 controller.establishWaitGroup.Add(1) 1590 go func() { 1591 defer controller.establishWaitGroup.Done() 1592 defer close(tacticsDone) 1593 GetTactics(controller.establishCtx, controller.config) 1594 }() 1595 1596 select { 1597 case <-tacticsDone: 1598 case <-tacticsWaitPeriod.C: 1599 } 1600 1601 tacticsWaitPeriod.Stop() 1602 1603 if controller.isStopEstablishing() { 1604 // This check isn't strictly required but avoids the overhead of launching 1605 // workers if establishment stopped while awaiting a tactics request. 1606 return 1607 } 1608 } 1609 1610 // Initial- and LimitTunnelProtocols may be set by tactics. 1611 // 1612 // These protocol limits are fixed once per establishment, for 1613 // consistent application of related probabilities (applied by 1614 // ParametersAccessor.TunnelProtocols). The 1615 // establishLimitTunnelProtocolsState field must be read-only after this 1616 // point, allowing concurrent reads by establishment workers. 
1617 1618 p := controller.config.GetParameters().Get() 1619 1620 controller.protocolSelectionConstraints = &protocolSelectionConstraints{ 1621 useUpstreamProxy: controller.config.UseUpstreamProxy(), 1622 initialLimitTunnelProtocols: p.TunnelProtocols(parameters.InitialLimitTunnelProtocols), 1623 initialLimitTunnelProtocolsCandidateCount: p.Int(parameters.InitialLimitTunnelProtocolsCandidateCount), 1624 limitTunnelProtocols: p.TunnelProtocols(parameters.LimitTunnelProtocols), 1625 1626 limitTunnelDialPortNumbers: protocol.TunnelProtocolPortLists( 1627 p.TunnelProtocolPortLists(parameters.LimitTunnelDialPortNumbers)), 1628 1629 replayCandidateCount: p.Int(parameters.ReplayCandidateCount), 1630 } 1631 1632 // ConnectionWorkerPoolSize may be set by tactics. 1633 1634 workerPoolSize := p.Int(parameters.ConnectionWorkerPoolSize) 1635 1636 // When TargetServerEntry is used, override any worker pool size config or 1637 // tactic parameter and use a pool size of 1. The typical use case for 1638 // TargetServerEntry is to test a specific server with a single connection 1639 // attempt. Furthermore, too many concurrent attempts to connect to the 1640 // same server will trigger rate limiting. 1641 if controller.config.TargetServerEntry != "" { 1642 workerPoolSize = 1 1643 } 1644 1645 // TunnelPoolSize may be set by tactics, subject to local constraints. A pool 1646 // size of one is forced in packet tunnel mode or when using a 1647 // TargetServerEntry. The tunnel pool size is reduced when there are 1648 // insufficent known server entries, within the set region and protocol 1649 // constraints, to satisfy the target. 
	//
	// Limitations, to simplify concurrent access to shared state: a ceiling of
	// MAX_TUNNEL_POOL_SIZE is enforced by setTunnelPoolSize; the tunnel pool
	// size target is not re-adjusted after an API handshake, even though the
	// handshake response may deliver new tactics, or prune server entries which
	// were potential candidates; nor is the target re-adjusted after fetching
	// new server entries during this establishment.

	tunnelPoolSize := p.Int(parameters.TunnelPoolSize)
	if controller.config.PacketTunnelTunFileDescriptor > 0 ||
		controller.config.TargetServerEntry != "" {
		// Packet tunnel mode and single-target mode both require exactly one
		// tunnel, overriding the tactics-supplied pool size.
		tunnelPoolSize = 1
	}

	p.Close()

	// Trigger CandidateServers and AvailableEgressRegions notices. By default,
	// this is an asynchronous operation, as the underlying full server entry
	// list enumeration may be a slow operation. In certain cases, where
	// candidate counts are required up front, await the result before
	// proceeding.

	awaitResponse := tunnelPoolSize > 1 ||
		controller.protocolSelectionConstraints.initialLimitTunnelProtocolsCandidateCount > 0

	// AvailableEgressRegions: after a fresh install, the outer client may not
	// have a list of regions to display; and LimitTunnelProtocols may reduce the
	// number of available regions.
	//
	// When the outer client receives NoticeAvailableEgressRegions and the
	// configured EgressRegion is not included in the region list, the outer
	// client _should_ stop tunnel-core and prompt the user to change the region
	// selection, as there are insufficient servers/capabilities to establish a
	// tunnel in the selected region.
	//
	// This report is delayed until after tactics are likely to be applied,
	// above; this avoids a ReportAvailableRegions reporting too many regions,
	// followed shortly by a ReportAvailableRegions reporting fewer regions. That
	// sequence could cause issues in the outer client UI.
	//
	// The reported regions are limited by protocolSelectionConstraints; in the
	// case where an initial limit is in place, only regions available for the
	// initial limit are reported. The initial phase will not complete if
	// EgressRegion is set such that there are no server entries with the
	// necessary protocol capabilities (either locally or from a remote server
	// list fetch).

	// Concurrency note: controller.protocolSelectionConstraints may be
	// overwritten before serverEntriesReporter reads it, and so cannot be
	// accessed directly by serverEntriesReporter.
	reportRequest := &serverEntriesReportRequest{
		constraints: controller.protocolSelectionConstraints,
	}

	if awaitResponse {
		// Buffer size of 1 ensures the sender, serverEntryReporter, won't block on
		// sending the response in the case where launchEstablishing exits due to
		// stopping establishment.
		reportRequest.awaitResponse = make(chan *serverEntriesReportResponse, 1)
	}

	controller.signalServerEntriesReporter(reportRequest)

	if awaitResponse {

		var reportResponse *serverEntriesReportResponse
		select {
		case reportResponse = <-reportRequest.awaitResponse:
		case <-controller.establishCtx.Done():
			// The sender may be gone when shutting down, or may not send until after
			// stopping establishment.
			return
		}
		if reportResponse.err != nil {
			NoticeError("failed to report server entries: %v",
				errors.Trace(reportResponse.err))
			controller.SignalComponentFailure()
			return
		}

		// Make adjustments based on candidate counts.

		if tunnelPoolSize > 1 {
			// Initial candidate count is ignored as count candidates will eventually
			// become available.
			if reportResponse.candidates < tunnelPoolSize {
				tunnelPoolSize = reportResponse.candidates
			}
			if tunnelPoolSize < 1 {
				tunnelPoolSize = 1
			}
		}
		controller.setTunnelPoolSize(tunnelPoolSize)

		// If InitialLimitTunnelProtocols is configured but cannot be satisfied,
		// skip the initial phase in this establishment. This avoids spinning,
		// unable to connect, in this case. InitialLimitTunnelProtocols is
		// intended to prioritize certain protocols, but not strictly select them.
		//
		// The candidate count check ignores egress region selection. When an egress
		// region is selected, it's the responsibility of the outer client to react
		// to the following ReportAvailableRegions output and clear the user's
		// selected region to prevent spinning, unable to connect. The initial phase
		// is skipped only when InitialLimitTunnelProtocols cannot be satisfied
		// _regardless_ of region selection.
		//
		// We presume that, in practise, most clients will have embedded server
		// entries with capabilities for most protocols; and that clients will
		// often perform RSL checks. So clients should most often have the
		// necessary capabilities to satisfy InitialLimitTunnelProtocols. When
		// this check fails, RSL/OSL/upgrade checks are triggered in order to gain
		// new capabilities.
		//
		// LimitTunnelProtocols remains a hard limit, as using prohibited
		// protocols may have some bad effect, such as a firewall blocking all
		// traffic from a host.

		if controller.protocolSelectionConstraints.initialLimitTunnelProtocolsCandidateCount > 0 {

			if reportResponse.initialCandidatesAnyEgressRegion == 0 {
				NoticeWarning("skipping initial limit tunnel protocols")
				controller.protocolSelectionConstraints.initialLimitTunnelProtocolsCandidateCount = 0

				// Since we were unable to satisfy the InitialLimitTunnelProtocols
				// tactic, trigger RSL, OSL, and upgrade fetches to potentially
				// gain new capabilities.
				controller.triggerFetches()
			}
		}
	}

	// Launch the worker pool and the candidate generator; all are tracked by
	// establishWaitGroup so stopEstablishing can wait for a clean shutdown.
	for i := 0; i < workerPoolSize; i++ {
		controller.establishWaitGroup.Add(1)
		go controller.establishTunnelWorker()
	}

	controller.establishWaitGroup.Add(1)
	go controller.establishCandidateGenerator()
}

// stopEstablishing signals the establish goroutines to stop and waits
// for the group to halt.
func (controller *Controller) stopEstablishing() {
	// No-op when establishment is not in progress.
	if !controller.isEstablishing {
		return
	}
	NoticeInfo("stop establishing")
	controller.stopEstablish()
	// Note: establishCandidateGenerator closes controller.candidateServerEntries
	// (as it may be sending to that channel).
	controller.establishWaitGroup.Wait()
	NoticeInfo("stopped establishing")

	// Clear all per-establishment state; the next establishment cycle
	// recreates these fields.
	controller.isEstablishing = false
	controller.establishStartTime = time.Time{}
	controller.establishCtx = nil
	controller.stopEstablish = nil
	controller.establishWaitGroup = nil
	controller.candidateServerEntries = nil
	controller.serverAffinityDoneBroadcast = nil

	// Snapshot and reset the concurrency counters under the mutex; the peak
	// values are logged below, outside the critical section.
	controller.concurrentEstablishTunnelsMutex.Lock()
	peakConcurrent := controller.peakConcurrentEstablishTunnels
	peakConcurrentIntensive := controller.peakConcurrentIntensiveEstablishTunnels
	controller.establishConnectTunnelCount = 0
	controller.concurrentEstablishTunnels = 0
	controller.concurrentIntensiveEstablishTunnels = 0
	controller.peakConcurrentEstablishTunnels = 0
	controller.peakConcurrentIntensiveEstablishTunnels = 0
	controller.concurrentEstablishTunnelsMutex.Unlock()
	NoticeInfo("peak concurrent establish tunnels: %d", peakConcurrent)
	NoticeInfo("peak concurrent resource intensive establish tunnels: %d", peakConcurrentIntensive)

	emitMemoryMetrics()
	DoGarbageCollection()

	// Record datastore metrics after establishment, the phase which generates
	// the bulk of all datastore transactions: iterating over server entries,
	// storing new server entries, etc.
	emitDatastoreMetrics()

	// Similarly, establishment generates the bulk of domain resolves.
	emitDNSMetrics(controller.resolver)
}

// establishCandidateGenerator populates the candidate queue with server entries
// from the data store. Server entries are iterated in rank order, so that promoted
// servers with higher rank are priority candidates.
func (controller *Controller) establishCandidateGenerator() {
	defer controller.establishWaitGroup.Done()
	// Closing candidateServerEntries signals the establishTunnelWorker
	// goroutines, which range over this channel, to exit.
	defer close(controller.candidateServerEntries)

	// totalNetworkWaitDuration is the elapsed time spent waiting
	// for network connectivity. This duration will be excluded
	// from reported tunnel establishment duration.
	var totalNetworkWaitDuration time.Duration

	applyServerAffinity, iterator, err := NewServerEntryIterator(controller.config)
	if err != nil {
		NoticeError("failed to iterate over candidates: %v", errors.Trace(err))
		controller.SignalComponentFailure()
		return
	}
	defer iterator.Close()

	// TODO: reconcile server affinity scheme with multi-tunnel mode
	if controller.getTunnelPoolSize() > 1 {
		applyServerAffinity = false
	}

	isServerAffinityCandidate := true
	if !applyServerAffinity {
		// No affinity candidate will be dispatched, so unblock the workers
		// waiting on the broadcast channel immediately. This is the only
		// closer of serverAffinityDoneBroadcast in this case.
		isServerAffinityCandidate = false
		close(controller.serverAffinityDoneBroadcast)
	}

loop:
	// Repeat until stopped
	for {

		// A "round" consists of a new shuffle of the server entries and attempted
		// connections up to the end of the server entry iterator, or
		// parameters.EstablishTunnelWorkTime elapsed. Time spent waiting for
		// network connectivity is excluded from round elapsed time.
		//
		// After a round, if parameters.EstablishTunnelWorkTime has elapsed in total
		// with no tunnel established, remote server list and upgrade checks are
		// triggered.
		//
		// A complete server entry iteration does not trigger fetches since it's
		// possible to have fewer than parameters.ConnectionWorkerPoolSize
		// candidates, in which case rounds end instantly due to the complete server
		// entry iteration. An exception is made for an empty server entry iterator;
		// in that case fetches may be triggered immediately.
		//
		// The number of server candidates may change during this loop, due to
		// remote server list fetches. Due to the performance impact, we will not
		// trigger additional, informational CandidateServer notices while in the
		// establishing loop. Clients typically re-establish often enough that we
		// will see the effect of the remote server list fetch in diagnostics.

		roundStartTime := time.Now()
		var roundNetworkWaitDuration time.Duration

		workTime := controller.config.GetParameters().Get().Duration(
			parameters.EstablishTunnelWorkTime)

		candidateServerEntryCount := 0

		// Send each iterator server entry to the establish workers
		for {

			// Block until the network is up; the wait is excluded from both
			// round and total establishment elapsed time.
			networkWaitStartTime := time.Now()
			if !WaitForNetworkConnectivity(
				controller.establishCtx,
				controller.config.NetworkConnectivityChecker) {
				break loop
			}
			networkWaitDuration := time.Since(networkWaitStartTime)
			roundNetworkWaitDuration += networkWaitDuration
			totalNetworkWaitDuration += networkWaitDuration

			serverEntry, err := iterator.Next()
			if err != nil {
				NoticeError("failed to get next candidate: %v", errors.Trace(err))
				controller.SignalComponentFailure()
				break loop
			}
			if serverEntry == nil {
				// Completed this iteration
				NoticeInfo("completed server entry iteration")
				break
			}

			// Skip servers that can't support the configured API protocol.
			if controller.config.TargetApiProtocol == protocol.PSIPHON_SSH_API_PROTOCOL &&
				!serverEntry.SupportsSSHAPIRequests() {
				continue
			}

			candidateServerEntryCount += 1

			// adjustedEstablishStartTime is establishStartTime shifted
			// to exclude time spent waiting for network connectivity.
			adjustedEstablishStartTime := controller.establishStartTime.Add(
				totalNetworkWaitDuration)

			candidate := &candidateServerEntry{
				serverEntry:                serverEntry,
				isServerAffinityCandidate:  isServerAffinityCandidate,
				adjustedEstablishStartTime: adjustedEstablishStartTime,
			}

			wasServerAffinityCandidate := isServerAffinityCandidate

			// Note: there must be only one server affinity candidate, as it
			// closes the serverAffinityDoneBroadcast channel.
			isServerAffinityCandidate = false

			// TODO: here we could generate multiple candidates from the
			// server entry when there are many MeekFrontingAddresses.

			select {
			case controller.candidateServerEntries <- candidate:
			case <-controller.establishCtx.Done():
				break loop
			}

			if time.Since(roundStartTime)-roundNetworkWaitDuration > workTime {
				// Start over, after a brief pause, with a new shuffle of the server
				// entries, and potentially some newly fetched server entries.
				break
			}

			if wasServerAffinityCandidate {

				// Don't start the next candidate until either the server affinity
				// candidate has completed (success or failure) or is still working
				// and the grace period has elapsed.

				gracePeriod := controller.config.GetParameters().Get().Duration(
					parameters.EstablishTunnelServerAffinityGracePeriod)

				if gracePeriod > 0 {
					timer := time.NewTimer(gracePeriod)
					select {
					case <-timer.C:
					case <-controller.serverAffinityDoneBroadcast:
					case <-controller.establishCtx.Done():
						timer.Stop()
						break loop
					}
					timer.Stop()
				}
			}
		}

		// Free up resources now, but don't reset until after the pause.
		iterator.Close()

		// Trigger RSL, OSL, and upgrade checks after failing to establish a
		// tunnel within parameters.EstablishTunnelWorkTime, or if there are
		// no server entries present.
		//
		// While the trigger is made after each round,
		// parameter.FetchRemoteServerListStalePeriod will limit the actual
		// frequency of fetches. Continuing to trigger allows for very long running
		// establishments to perhaps eventually succeed.
		//
		// No fetches are triggered when TargetServerEntry is specified. In that
		// case, we're only trying to connect to a specific server entry.

		if (candidateServerEntryCount == 0 ||
			time.Since(controller.establishStartTime)-totalNetworkWaitDuration > workTime) &&
			controller.config.TargetServerEntry == "" {

			controller.triggerFetches()
		}

		// After a complete iteration of candidate servers, pause before iterating again.
		// This helps avoid some busy wait loop conditions, and also allows some time for
		// network conditions to change. Also allows for fetch remote to complete,
		// in typical conditions (it isn't strictly necessary to wait for this, there will
		// be more rounds if required).

		p := controller.config.GetParameters().Get()
		timeout := prng.JitterDuration(
			p.Duration(parameters.EstablishTunnelPausePeriod),
			p.Float(parameters.EstablishTunnelPausePeriodJitter))
		p.Close()

		timer := time.NewTimer(timeout)
		select {
		case <-timer.C:
			// Retry iterating
		case <-controller.establishCtx.Done():
			timer.Stop()
			break loop
		}
		timer.Stop()

		iterator.Reset()
	}
}

// establishTunnelWorker pulls candidates from the candidate queue, establishes
// a connection to the tunnel server, and delivers the connected tunnel to a channel.
func (controller *Controller) establishTunnelWorker() {
	defer controller.establishWaitGroup.Done()
loop:
	// The loop exits when establishCandidateGenerator closes
	// candidateServerEntries, or via an explicit break on stop.
	for candidateServerEntry := range controller.candidateServerEntries {

		// Note: don't receive from candidateServerEntries and isStopEstablishing
		// in the same select, since we want to prioritize receiving the stop signal
		if controller.isStopEstablishing() {
			break loop
		}

		// There may already be a tunnel to this candidate. If so, skip it.
		if controller.isActiveTunnelServerEntry(candidateServerEntry.serverEntry) {
			continue
		}

		// TODO: we allow multiple, concurrent workers to attempt to connect to the
		// same server. This is not wasteful if the server supports several
		// different protocols, some of which may be blocked while others are not
		// blocked. Limiting protocols with [Initial]LimitTunnelProtocols may make
		// these multiple attempts redundant. Also, replay should be used only by
		// the first attempt.

		// upstreamProxyErrorCallback will post NoticeUpstreamProxyError when the
		// tunnel dial fails due to an upstream proxy error. As the upstream proxy
		// is user configured, the error message may need to be relayed to the user.

		// As the callback may be invoked after establishment is over (e.g., if an
		// initial dial isn't fully shutdown when ConnectTunnel returns; or a meek
		// underlying TCP connection re-dial) don't access these variables
		// directly.
		callbackCandidateServerEntry := candidateServerEntry
		callbackEstablishCtx := controller.establishCtx

		upstreamProxyErrorCallback := func(err error) {

			// Do not post the notice when overall establishment context is canceled or
			// timed-out: the upstream proxy connection error is likely a result of the
			// cancellation, and not a condition to be fixed by the user. In the case
			// of meek underlying TCP connection re-dials, this condition will always
			// be true; however in this case the initial dial succeeded with the
			// current upstream proxy settings, so any upstream proxy error is
			// transient.
			if callbackEstablishCtx.Err() != nil {
				return
			}

			// Another class of non-fatal upstream proxy error arises from proxies
			// which limit permitted proxied ports. In this case, some tunnels may fail
			// due to dial port, while others may eventually succeed. To avoid this
			// class of errors, delay posting the notice. If the upstream proxy works,
			// _some_ tunnel should connect. If the upstream proxy configuration is
			// broken, the error should persist and eventually get posted.

			p := controller.config.GetParameters().Get()
			workerPoolSize := p.Int(parameters.ConnectionWorkerPoolSize)
			minWaitDuration := p.Duration(parameters.UpstreamProxyErrorMinWaitDuration)
			maxWaitDuration := p.Duration(parameters.UpstreamProxyErrorMaxWaitDuration)
			p.Close()

			controller.concurrentEstablishTunnelsMutex.Lock()
			establishConnectTunnelCount := controller.establishConnectTunnelCount
			controller.concurrentEstablishTunnelsMutex.Unlock()

			// Delay UpstreamProxyErrorMinWaitDuration (excluding time spent waiting
			// for network connectivity) and then until either
			// UpstreamProxyErrorMaxWaitDuration has elapsed or, to post sooner if many
			// candidates are failing, at least workerPoolSize tunnel connection
			// attempts have completed. We infer that at least workerPoolSize
			// candidates have completed by checking that at least 2*workerPoolSize
			// candidates have started.

			elapsedTime := time.Since(
				callbackCandidateServerEntry.adjustedEstablishStartTime)

			if elapsedTime < minWaitDuration ||
				(elapsedTime < maxWaitDuration &&
					establishConnectTunnelCount < 2*workerPoolSize) {
				return
			}

			NoticeUpstreamProxyError(err)
		}

		// Select the tunnel protocol. The selection will be made at random
		// from protocols supported by the server entry, optionally limited by
		// LimitTunnelProtocols.
		//
		// When limiting concurrent resource intensive protocol connection
		// workers, and at the limit, do not select resource intensive
		// protocols since otherwise the candidate must be skipped.
		//
		// If at the limit and unable to select a non-intensive protocol,
		// skip the candidate entirely and move on to the next. Since
		// candidates are shuffled it's likely that the next candidate is not
		// intensive. In this case, a StaggerConnectionWorkersMilliseconds
		// delay may still be incurred.

		limitIntensiveConnectionWorkers := controller.config.GetParameters().Get().Int(
			parameters.LimitIntensiveConnectionWorkers)

		// The mutex remains held through MakeDialParameters so that
		// establishConnectTunnelCount and the concurrency counters are read
		// and updated consistently with protocol selection.
		controller.concurrentEstablishTunnelsMutex.Lock()

		excludeIntensive := false
		if limitIntensiveConnectionWorkers > 0 &&
			controller.concurrentIntensiveEstablishTunnels >= limitIntensiveConnectionWorkers {
			excludeIntensive = true
		}

		canReplay := func(serverEntry *protocol.ServerEntry, replayProtocol string) bool {
			return controller.protocolSelectionConstraints.canReplay(
				controller.establishConnectTunnelCount,
				excludeIntensive,
				serverEntry,
				replayProtocol)
		}

		selectProtocol := func(serverEntry *protocol.ServerEntry) (string, bool) {
			return controller.protocolSelectionConstraints.selectProtocol(
				controller.establishConnectTunnelCount,
				excludeIntensive,
				serverEntry)
		}

		// MakeDialParameters may return a replay instance, if the server
		// entry has a previous, recent successful connection and
		// tactics/config has not changed.
		//
		// In the first round -- and later rounds, with some probability -- of
		// establishing, ServerEntryIterator will move potential replay candidates
		// to the front of the iterator after the random shuffle, which greatly
		// prioritizes previously successful servers for that round.
		//
		// As ServerEntryIterator does not unmarshal and validate replay
		// candidate dial parameters, some potential replay candidates may
		// have expired or otherwise ineligible dial parameters; in this case
		// the candidate proceeds without replay.
		//
		// The ReplayCandidateCount tactic determines how many candidates may use
		// replay. After ReplayCandidateCount candidates of any type, replay or no,
		// replay is skipped. If ReplayCandidateCount exceeds the initial round,
		// replay may still be performed but the iterator may no longer move
		// potential replay server entries to the front. When ReplayCandidateCount
		// is set to -1, unlimited candidates may use replay.

		dialParams, err := MakeDialParameters(
			controller.config,
			upstreamProxyErrorCallback,
			canReplay,
			selectProtocol,
			candidateServerEntry.serverEntry,
			false,
			controller.establishConnectTunnelCount,
			int(atomic.LoadInt32(&controller.establishedTunnelsCount)))
		if dialParams == nil || err != nil {

			controller.concurrentEstablishTunnelsMutex.Unlock()

			// MakeDialParameters returns nil/nil when the server entry is to
			// be skipped. See MakeDialParameters for skip cases and skip
			// logging. Silently fail the candidate in this case. Otherwise,
			// emit error.
			if err != nil {
				NoticeInfo("failed to make dial parameters for %s: %v",
					candidateServerEntry.serverEntry.GetDiagnosticID(),
					errors.Trace(err))
			}

			// Unblock other candidates immediately when server affinity
			// candidate is skipped.
			if candidateServerEntry.isServerAffinityCandidate {
				close(controller.serverAffinityDoneBroadcast)
			}

			continue
		}

		// Increment establishConnectTunnelCount only after selectProtocol has
		// succeeded to ensure InitialLimitTunnelProtocolsCandidateCount
		// candidates use InitialLimitTunnelProtocols.
		establishConnectTunnelCount := controller.establishConnectTunnelCount
		controller.establishConnectTunnelCount += 1

		isIntensive := protocol.TunnelProtocolIsResourceIntensive(dialParams.TunnelProtocol)

		// Track concurrency and peak-concurrency counters; decremented after
		// ConnectTunnel returns, below.
		if isIntensive {
			controller.concurrentIntensiveEstablishTunnels += 1
			if controller.concurrentIntensiveEstablishTunnels > controller.peakConcurrentIntensiveEstablishTunnels {
				controller.peakConcurrentIntensiveEstablishTunnels = controller.concurrentIntensiveEstablishTunnels
			}
		}
		controller.concurrentEstablishTunnels += 1
		if controller.concurrentEstablishTunnels > controller.peakConcurrentEstablishTunnels {
			controller.peakConcurrentEstablishTunnels = controller.concurrentEstablishTunnels
		}

		controller.concurrentEstablishTunnelsMutex.Unlock()

		// Apply stagger only now that we're past MakeDialParameters and
		// protocol selection logic which may have caused the candidate to be
		// skipped. The stagger logic delays dialing, and we don't want to
		// incur that delay when skipping.
		//
		// Locking staggerMutex serializes staggers, so that multiple workers
		// don't simply sleep in parallel.
		//
		// The stagger is applied when establishConnectTunnelCount > 0 -- that
		// is, for all but the first dial.

		p := controller.config.GetParameters().Get()
		staggerPeriod := p.Duration(parameters.StaggerConnectionWorkersPeriod)
		staggerJitter := p.Float(parameters.StaggerConnectionWorkersJitter)
		p.Close()

		if establishConnectTunnelCount > 0 && staggerPeriod != 0 {
			controller.staggerMutex.Lock()
			timer := time.NewTimer(prng.JitterDuration(staggerPeriod, staggerJitter))
			select {
			case <-timer.C:
			case <-controller.establishCtx.Done():
			}
			timer.Stop()
			controller.staggerMutex.Unlock()
		}

		// ConnectTunnel will allocate significant memory, so first attempt to
		// reclaim as much as possible.
		DoGarbageCollection()

		tunnel, err := ConnectTunnel(
			controller.establishCtx,
			controller.config,
			candidateServerEntry.adjustedEstablishStartTime,
			dialParams)

		controller.concurrentEstablishTunnelsMutex.Lock()
		if isIntensive {
			controller.concurrentIntensiveEstablishTunnels -= 1
		}
		controller.concurrentEstablishTunnels -= 1
		controller.concurrentEstablishTunnelsMutex.Unlock()

		// Periodically emit memory metrics during the establishment cycle.
		if !controller.isStopEstablishing() {
			emitMemoryMetrics()
		}

		// Immediately reclaim memory allocated by the establishment. In the case
		// of failure, first clear the reference to the tunnel. In the case of
		// success, the garbage collection may still be effective as the initial
		// phases of some protocols involve significant memory allocation that
		// could now be reclaimed.
		if err != nil {
			tunnel = nil
		}
		DoGarbageCollection()

		if err != nil {

			// Unblock other candidates immediately when server affinity
			// candidate fails.
			if candidateServerEntry.isServerAffinityCandidate {
				close(controller.serverAffinityDoneBroadcast)
			}

			// Before emitting error, check if establish interrupted, in which
			// case the error is noise.
			if controller.isStopEstablishing() {
				break loop
			}

			NoticeInfo("failed to connect to %s: %v",
				candidateServerEntry.serverEntry.GetDiagnosticID(),
				errors.Trace(err))

			continue
		}

		// Deliver connected tunnel.
		// Don't block. Assumes the receiver has a buffer large enough for
		// the number of desired tunnels. If there's no room, the tunnel must
		// not be required so it's discarded.
		select {
		case controller.connectedTunnels <- tunnel:
		default:
			controller.discardTunnel(tunnel)

			// Clear the reference to this discarded tunnel and immediately run
			// a garbage collection to reclaim its memory.
			tunnel = nil
			DoGarbageCollection()
		}

		// Unblock other candidates only after delivering when
		// server affinity candidate succeeds.
		if candidateServerEntry.isServerAffinityCandidate {
			close(controller.serverAffinityDoneBroadcast)
		}
	}
}

// isStopEstablishing reports whether the current establishment context has
// been canceled, without blocking.
func (controller *Controller) isStopEstablishing() bool {
	select {
	case <-controller.establishCtx.Done():
		return true
	default:
	}
	return false
}