github.com/astaguna/popon-core@v0.0.0-20231019235610-96e42d76a5ff/psiphon/tunnel.go (about) 1 /* 2 * Copyright (c) 2015, Psiphon Inc. 3 * All rights reserved. 4 * 5 * This program is free software: you can redistribute it and/or modify 6 * it under the terms of the GNU General Public License as published by 7 * the Free Software Foundation, either version 3 of the License, or 8 * (at your option) any later version. 9 * 10 * This program is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * GNU General Public License for more details. 14 * 15 * You should have received a copy of the GNU General Public License 16 * along with this program. If not, see <http://www.gnu.org/licenses/>. 17 * 18 */ 19 20 package psiphon 21 22 import ( 23 "bytes" 24 "context" 25 "crypto/rand" 26 "encoding/base64" 27 "encoding/json" 28 std_errors "errors" 29 "fmt" 30 "io" 31 "io/ioutil" 32 "net" 33 "net/http" 34 "sync" 35 "sync/atomic" 36 "time" 37 38 "github.com/astaguna/popon-core/psiphon/common" 39 "github.com/astaguna/popon-core/psiphon/common/crypto/ssh" 40 "github.com/astaguna/popon-core/psiphon/common/errors" 41 "github.com/astaguna/popon-core/psiphon/common/obfuscator" 42 "github.com/astaguna/popon-core/psiphon/common/parameters" 43 "github.com/astaguna/popon-core/psiphon/common/prng" 44 "github.com/astaguna/popon-core/psiphon/common/protocol" 45 "github.com/astaguna/popon-core/psiphon/common/quic" 46 "github.com/astaguna/popon-core/psiphon/common/refraction" 47 "github.com/astaguna/popon-core/psiphon/common/tactics" 48 "github.com/astaguna/popon-core/psiphon/transferstats" 49 ) 50 51 // Tunneler specifies the interface required by components that use tunnels. 52 type Tunneler interface { 53 54 // Dial creates a tunneled connection. 55 // 56 // When split tunnel mode is enabled, the connection may be untunneled, 57 // depending on GeoIP classification of the destination. 58 // 59 // downstreamConn is an optional parameter which specifies a connection to be 60 // explicitly closed when the Dialed connection is closed. For instance, this 61 // is used to close downstreamConn App<->LocalProxy connections when the related 62 // LocalProxy<->SshPortForward connections close. 63 Dial(remoteAddr string, downstreamConn net.Conn) (conn net.Conn, err error) 64 65 DirectDial(remoteAddr string) (conn net.Conn, err error) 66 67 SignalComponentFailure() 68 } 69 70 // TunnelOwner specifies the interface required by Tunnel to notify its 71 // owner when it has failed. The owner may, as in the case of the Controller, 72 // remove the tunnel from its list of active tunnels. 73 type TunnelOwner interface { 74 SignalSeededNewSLOK() 75 SignalTunnelFailure(tunnel *Tunnel) 76 } 77 78 // Tunnel is a connection to a Psiphon server. An established 79 // tunnel includes a network connection to the specified server 80 // and an SSH session built on top of that transport. 81 type Tunnel struct { 82 mutex *sync.Mutex 83 config *Config 84 isActivated bool 85 isDiscarded bool 86 isClosed bool 87 dialParams *DialParameters 88 livenessTestMetrics *livenessTestMetrics 89 extraFailureAction func() 90 serverContext *ServerContext 91 monitoringStartTime time.Time 92 conn *common.BurstMonitoredConn 93 sshClient *ssh.Client 94 sshServerRequests <-chan *ssh.Request 95 operateWaitGroup *sync.WaitGroup 96 operateCtx context.Context 97 stopOperate context.CancelFunc 98 signalPortForwardFailure chan struct{} 99 totalPortForwardFailures int 100 adjustedEstablishStartTime time.Time 101 establishDuration time.Duration 102 establishedTime time.Time 103 handledSSHKeepAliveFailure int32 104 inFlightConnectedRequestSignal chan struct{} 105 } 106 107 // getCustomParameters helpers wrap the verbose function call chain required 108 // to get a current snapshot of the parameters.Parameters customized with the 109 // dial parameters associated with a tunnel. 110 111 func (tunnel *Tunnel) getCustomParameters() parameters.ParametersAccessor { 112 return getCustomParameters(tunnel.config, tunnel.dialParams) 113 } 114 115 func getCustomParameters( 116 config *Config, dialParams *DialParameters) parameters.ParametersAccessor { 117 return config.GetParameters().GetCustom(dialParams.NetworkLatencyMultiplier) 118 } 119 120 // ConnectTunnel first makes a network transport connection to the 121 // Psiphon server and then establishes an SSH client session on top of 122 // that transport. The SSH server is authenticated using the public 123 // key in the server entry. 124 // Depending on the server's capabilities, the connection may use 125 // plain SSH over TCP, obfuscated SSH over TCP, or obfuscated SSH over 126 // HTTP (meek protocol). 127 // When requiredProtocol is not blank, that protocol is used. Otherwise, 128 // the a random supported protocol is used. 129 // 130 // Call Activate on a connected tunnel to complete its establishment 131 // before using. 132 // 133 // Tunnel establishment is split into two phases: connection, and 134 // activation. The Controller will run many ConnectTunnel calls 135 // concurrently and then, to avoid unnecessary overhead from making 136 // handshake requests and starting operateTunnel from tunnels which 137 // may be discarded, call Activate on connected tunnels sequentially 138 // as necessary. 139 func ConnectTunnel( 140 ctx context.Context, 141 config *Config, 142 adjustedEstablishStartTime time.Time, 143 dialParams *DialParameters) (*Tunnel, error) { 144 145 // Build transport layers and establish SSH connection. Note that 146 // dialConn and monitoredConn are the same network connection. 147 dialResult, err := dialTunnel(ctx, config, dialParams) 148 if err != nil { 149 return nil, errors.Trace(err) 150 } 151 152 // The tunnel is now connected 153 return &Tunnel{ 154 mutex: new(sync.Mutex), 155 config: config, 156 dialParams: dialParams, 157 livenessTestMetrics: dialResult.livenessTestMetrics, 158 extraFailureAction: dialResult.extraFailureAction, 159 monitoringStartTime: dialResult.monitoringStartTime, 160 conn: dialResult.monitoredConn, 161 sshClient: dialResult.sshClient, 162 sshServerRequests: dialResult.sshRequests, 163 // A buffer allows at least one signal to be sent even when the receiver is 164 // not listening. Senders should not block. 165 signalPortForwardFailure: make(chan struct{}, 1), 166 adjustedEstablishStartTime: adjustedEstablishStartTime, 167 }, nil 168 } 169 170 // Activate completes the tunnel establishment, performing the handshake 171 // request and starting operateTunnel, the worker that monitors the tunnel 172 // and handles periodic management. 173 func (tunnel *Tunnel) Activate( 174 ctx context.Context, tunnelOwner TunnelOwner) (retErr error) { 175 176 // Ensure that, unless the base context is cancelled, any replayed dial 177 // parameters are cleared, no longer to be retried, if the tunnel fails to 178 // activate. 179 activationSucceeded := false 180 baseCtx := ctx 181 defer func() { 182 if !activationSucceeded && baseCtx.Err() != context.Canceled { 183 tunnel.dialParams.Failed(tunnel.config) 184 if tunnel.extraFailureAction != nil { 185 tunnel.extraFailureAction() 186 } 187 if retErr != nil { 188 _ = RecordFailedTunnelStat( 189 tunnel.config, 190 tunnel.dialParams, 191 tunnel.livenessTestMetrics, 192 -1, 193 -1, 194 retErr) 195 } 196 } 197 }() 198 199 // Create a new Psiphon API server context for this tunnel. This includes 200 // performing a handshake request. If the handshake fails, this activation 201 // fails. 202 var serverContext *ServerContext 203 if !tunnel.config.DisableApi { 204 NoticeInfo( 205 "starting server context for %s", 206 tunnel.dialParams.ServerEntry.GetDiagnosticID()) 207 208 // Call NewServerContext in a goroutine, as it blocks on a network operation, 209 // the handshake request, and would block shutdown. If the shutdown signal is 210 // received, close the tunnel, which will interrupt the handshake request 211 // that may be blocking NewServerContext. 212 // 213 // Timeout after PsiphonApiServerTimeoutSeconds. NewServerContext may not 214 // return if the tunnel network connection is unstable during the handshake 215 // request. At this point, there is no operateTunnel monitor that will detect 216 // this condition with SSH keep alives. 217 218 timeout := tunnel.getCustomParameters().Duration( 219 parameters.PsiphonAPIRequestTimeout) 220 221 if timeout > 0 { 222 var cancelFunc context.CancelFunc 223 ctx, cancelFunc = context.WithTimeout(ctx, timeout) 224 defer cancelFunc() 225 } 226 227 type newServerContextResult struct { 228 serverContext *ServerContext 229 err error 230 } 231 232 resultChannel := make(chan newServerContextResult) 233 234 go func() { 235 serverContext, err := NewServerContext(tunnel) 236 resultChannel <- newServerContextResult{ 237 serverContext: serverContext, 238 err: err, 239 } 240 }() 241 242 var result newServerContextResult 243 244 select { 245 case result = <-resultChannel: 246 case <-ctx.Done(): 247 result.err = ctx.Err() 248 // Interrupt the goroutine 249 tunnel.Close(true) 250 <-resultChannel 251 } 252 253 if result.err != nil { 254 return errors.Trace(result.err) 255 } 256 257 serverContext = result.serverContext 258 } 259 260 // The activation succeeded. 261 activationSucceeded = true 262 263 tunnel.mutex.Lock() 264 265 // It may happen that the tunnel gets closed while Activate is running. 266 // In this case, abort here, to ensure that the operateTunnel goroutine 267 // will not be launched after Close is called. 268 if tunnel.isClosed { 269 return errors.TraceNew("tunnel is closed") 270 } 271 272 tunnel.isActivated = true 273 tunnel.serverContext = serverContext 274 275 // establishDuration is the elapsed time between the controller starting tunnel 276 // establishment and this tunnel being established. The reported value represents 277 // how long the user waited between starting the client and having a usable tunnel; 278 // or how long between the client detecting an unexpected tunnel disconnect and 279 // completing automatic reestablishment. 280 // 281 // This time period may include time spent unsuccessfully connecting to other 282 // servers. Time spent waiting for network connectivity is excluded. 283 tunnel.establishDuration = time.Since(tunnel.adjustedEstablishStartTime) 284 tunnel.establishedTime = time.Now() 285 286 // Use the Background context instead of the controller run context, as tunnels 287 // are terminated when the controller calls tunnel.Close. 288 tunnel.operateCtx, tunnel.stopOperate = context.WithCancel(context.Background()) 289 tunnel.operateWaitGroup = new(sync.WaitGroup) 290 291 // Spawn the operateTunnel goroutine, which monitors the tunnel and handles periodic 292 // stats updates. 293 tunnel.operateWaitGroup.Add(1) 294 go tunnel.operateTunnel(tunnelOwner) 295 296 tunnel.mutex.Unlock() 297 298 return nil 299 } 300 301 // Close stops operating the tunnel and closes the underlying connection. 302 // Supports multiple and/or concurrent calls to Close(). 303 // When isDiscarded is set, operateTunnel will not attempt to send final 304 // status requests. 305 func (tunnel *Tunnel) Close(isDiscarded bool) { 306 307 tunnel.mutex.Lock() 308 tunnel.isDiscarded = isDiscarded 309 isActivated := tunnel.isActivated 310 isClosed := tunnel.isClosed 311 tunnel.isClosed = true 312 tunnel.mutex.Unlock() 313 314 if !isClosed { 315 316 // Signal operateTunnel to stop before closing the tunnel -- this 317 // allows a final status request to be made in the case of an orderly 318 // shutdown. 319 // A timer is set, so if operateTunnel takes too long to stop, the 320 // tunnel is closed, which will interrupt any slow final status request. 321 322 if isActivated { 323 timeout := tunnel.getCustomParameters().Duration( 324 parameters.TunnelOperateShutdownTimeout) 325 afterFunc := time.AfterFunc( 326 timeout, 327 func() { tunnel.conn.Close() }) 328 tunnel.stopOperate() 329 tunnel.operateWaitGroup.Wait() 330 afterFunc.Stop() 331 } 332 333 tunnel.sshClient.Close() 334 // tunnel.conn.Close() may get called multiple times, which is allowed. 335 tunnel.conn.Close() 336 337 err := tunnel.sshClient.Wait() 338 if err != nil { 339 NoticeWarning("close tunnel ssh error: %s", err) 340 } 341 } 342 343 // Log burst metrics now that the BurstMonitoredConn is closed. 344 // Metrics will be empty when burst monitoring is disabled. 345 if !isDiscarded && isActivated { 346 burstMetrics := tunnel.conn.GetMetrics(tunnel.monitoringStartTime) 347 if len(burstMetrics) > 0 { 348 NoticeBursts( 349 tunnel.dialParams.ServerEntry.GetDiagnosticID(), 350 burstMetrics) 351 } 352 } 353 } 354 355 // SetInFlightConnectedRequest checks if a connected request can begin and 356 // sets the channel used to signal that the request is complete. 357 // 358 // The caller must not initiate a connected request when 359 // SetInFlightConnectedRequest returns false. When SetInFlightConnectedRequest 360 // returns true, the caller must call SetInFlightConnectedRequest(nil) when 361 // the connected request completes. 362 func (tunnel *Tunnel) SetInFlightConnectedRequest(requestSignal chan struct{}) bool { 363 tunnel.mutex.Lock() 364 defer tunnel.mutex.Unlock() 365 366 // If already closing, don't start a connected request: the 367 // TunnelOperateShutdownTimeout period may be nearly expired. 368 if tunnel.isClosed { 369 return false 370 } 371 372 if requestSignal == nil { 373 // Not already in-flight (not expected) 374 if tunnel.inFlightConnectedRequestSignal == nil { 375 return false 376 } 377 } else { 378 // Already in-flight (not expected) 379 if tunnel.inFlightConnectedRequestSignal != nil { 380 return false 381 } 382 } 383 384 tunnel.inFlightConnectedRequestSignal = requestSignal 385 386 return true 387 } 388 389 // AwaitInFlightConnectedRequest waits for the signal that any in-flight 390 // connected request is complete. 391 // 392 // AwaitInFlightConnectedRequest may block until the connected request is 393 // aborted by terminating the tunnel. 394 func (tunnel *Tunnel) AwaitInFlightConnectedRequest() { 395 tunnel.mutex.Lock() 396 requestSignal := tunnel.inFlightConnectedRequestSignal 397 tunnel.mutex.Unlock() 398 399 if requestSignal != nil { 400 <-requestSignal 401 } 402 } 403 404 // IsActivated returns the tunnel's activated flag. 405 func (tunnel *Tunnel) IsActivated() bool { 406 tunnel.mutex.Lock() 407 defer tunnel.mutex.Unlock() 408 return tunnel.isActivated 409 } 410 411 // IsDiscarded returns the tunnel's discarded flag. 412 func (tunnel *Tunnel) IsDiscarded() bool { 413 tunnel.mutex.Lock() 414 defer tunnel.mutex.Unlock() 415 return tunnel.isDiscarded 416 } 417 418 // SendAPIRequest sends an API request as an SSH request through the tunnel. 419 // This function blocks awaiting a response. Only one request may be in-flight 420 // at once; a concurrent SendAPIRequest will block until an active request 421 // receives its response (or the SSH connection is terminated). 422 func (tunnel *Tunnel) SendAPIRequest( 423 name string, requestPayload []byte) ([]byte, error) { 424 425 ok, responsePayload, err := tunnel.sshClient.Conn.SendRequest( 426 name, true, requestPayload) 427 428 if err != nil { 429 return nil, errors.Trace(err) 430 } 431 432 if !ok { 433 return nil, errors.TraceNew("API request rejected") 434 } 435 436 return responsePayload, nil 437 } 438 439 // DialTCPChannel establishes a TCP port forward connection through the 440 // tunnel. 441 // 442 // When split tunnel mode is enabled, and unless alwaysTunneled is set, the 443 // server may reject the port forward and indicate that the client is to make 444 // direct, untunneled connection. In this case, the bool return value is true 445 // and net.Conn and error are nil. 446 // 447 // downstreamConn is an optional parameter which specifies a connection to be 448 // explicitly closed when the dialed connection is closed. 449 func (tunnel *Tunnel) DialTCPChannel( 450 remoteAddr string, 451 alwaysTunneled bool, 452 downstreamConn net.Conn) (net.Conn, bool, error) { 453 454 channelType := "direct-tcpip" 455 if alwaysTunneled && tunnel.config.IsSplitTunnelEnabled() { 456 // This channel type is only necessary in split tunnel mode. 457 channelType = protocol.TCP_PORT_FORWARD_NO_SPLIT_TUNNEL_TYPE 458 } 459 460 channel, err := tunnel.dialChannel(channelType, remoteAddr) 461 if err != nil { 462 if isSplitTunnelRejectReason(err) { 463 return nil, true, nil 464 } 465 return nil, false, errors.Trace(err) 466 } 467 468 netConn, ok := channel.(net.Conn) 469 if !ok { 470 return nil, false, errors.Tracef("unexpected channel type: %T", channel) 471 } 472 473 conn := &TunneledConn{ 474 Conn: netConn, 475 tunnel: tunnel, 476 downstreamConn: downstreamConn} 477 478 return tunnel.wrapWithTransferStats(conn), false, nil 479 } 480 481 func (tunnel *Tunnel) DialPacketTunnelChannel() (net.Conn, error) { 482 483 channel, err := tunnel.dialChannel(protocol.PACKET_TUNNEL_CHANNEL_TYPE, "") 484 if err != nil { 485 return nil, errors.Trace(err) 486 } 487 488 sshChannel, ok := channel.(ssh.Channel) 489 if !ok { 490 return nil, errors.Tracef("unexpected channel type: %T", channel) 491 } 492 493 NoticeInfo("DialPacketTunnelChannel: established channel") 494 495 conn := newChannelConn(sshChannel) 496 497 // wrapWithTransferStats will track bytes transferred for the 498 // packet tunnel. It will count packet overhead (TCP/UDP/IP headers). 499 // 500 // Since the data in the channel is not HTTP or TLS, no domain bytes 501 // counting is expected. 502 // 503 // transferstats are also used to determine that there's been recent 504 // activity and skip periodic SSH keep alives; see Tunnel.operateTunnel. 505 506 return tunnel.wrapWithTransferStats(conn), nil 507 } 508 509 func (tunnel *Tunnel) dialChannel(channelType, remoteAddr string) (interface{}, error) { 510 511 if !tunnel.IsActivated() { 512 return nil, errors.TraceNew("tunnel is not activated") 513 } 514 515 // Note: there is no dial context since SSH port forward dials cannot 516 // be interrupted directly. Closing the tunnel will interrupt the dials. 517 // A timeout is set to unblock this function, but the goroutine may 518 // not exit until the tunnel is closed. 519 520 type channelDialResult struct { 521 channel interface{} 522 err error 523 } 524 525 // Use a buffer of 1 as there are two senders and only one guaranteed receive. 526 527 results := make(chan *channelDialResult, 1) 528 529 afterFunc := time.AfterFunc( 530 tunnel.getCustomParameters().Duration( 531 parameters.TunnelPortForwardDialTimeout), 532 func() { 533 results <- &channelDialResult{ 534 nil, errors.Tracef("channel dial timeout: %s", channelType)} 535 }) 536 defer afterFunc.Stop() 537 538 go func() { 539 result := new(channelDialResult) 540 switch channelType { 541 542 case "direct-tcpip", protocol.TCP_PORT_FORWARD_NO_SPLIT_TUNNEL_TYPE: 543 // The protocol.TCP_PORT_FORWARD_NO_SPLIT_TUNNEL_TYPE is the same as 544 // "direct-tcpip", except split tunnel channel rejections are disallowed 545 // even when split tunnel mode is enabled. 546 result.channel, result.err = 547 tunnel.sshClient.Dial(channelType, remoteAddr) 548 549 default: 550 var sshRequests <-chan *ssh.Request 551 result.channel, sshRequests, result.err = 552 tunnel.sshClient.OpenChannel(channelType, nil) 553 if result.err == nil { 554 go ssh.DiscardRequests(sshRequests) 555 } 556 } 557 if result.err != nil { 558 result.err = errors.Trace(result.err) 559 } 560 results <- result 561 }() 562 563 result := <-results 564 565 if result.err != nil { 566 if !isSplitTunnelRejectReason(result.err) { 567 select { 568 case tunnel.signalPortForwardFailure <- struct{}{}: 569 default: 570 } 571 } 572 return nil, errors.Trace(result.err) 573 } 574 575 return result.channel, nil 576 } 577 578 func isSplitTunnelRejectReason(err error) bool { 579 580 var openChannelErr *ssh.OpenChannelError 581 if std_errors.As(err, &openChannelErr) { 582 return openChannelErr.Reason == 583 ssh.RejectionReason(protocol.CHANNEL_REJECT_REASON_SPLIT_TUNNEL) 584 } 585 586 return false 587 } 588 589 func (tunnel *Tunnel) wrapWithTransferStats(conn net.Conn) net.Conn { 590 591 // Tunnel does not have a serverContext when DisableApi is set. We still use 592 // transferstats.Conn to count bytes transferred for monitoring tunnel 593 // quality. 594 var regexps *transferstats.Regexps 595 if tunnel.serverContext != nil { 596 regexps = tunnel.serverContext.StatsRegexps() 597 } 598 599 return transferstats.NewConn( 600 conn, tunnel.dialParams.ServerEntry.IpAddress, regexps) 601 } 602 603 // SignalComponentFailure notifies the tunnel that an associated component has failed. 604 // This will terminate the tunnel. 605 func (tunnel *Tunnel) SignalComponentFailure() { 606 NoticeWarning("tunnel received component failure signal") 607 tunnel.Close(false) 608 } 609 610 // TunneledConn implements net.Conn and wraps a port forward connection. 611 // It is used to hook into Read and Write to observe I/O errors and 612 // report these errors back to the tunnel monitor as port forward failures. 613 // TunneledConn optionally tracks a peer connection to be explicitly closed 614 // when the TunneledConn is closed. 615 type TunneledConn struct { 616 net.Conn 617 tunnel *Tunnel 618 downstreamConn net.Conn 619 } 620 621 func (conn *TunneledConn) Read(buffer []byte) (n int, err error) { 622 n, err = conn.Conn.Read(buffer) 623 if err != nil && err != io.EOF { 624 // Report new failure. Won't block; assumes the receiver 625 // has a sufficient buffer for the threshold number of reports. 626 // TODO: conditional on type of error or error message? 627 select { 628 case conn.tunnel.signalPortForwardFailure <- struct{}{}: 629 default: 630 } 631 } 632 return 633 } 634 635 func (conn *TunneledConn) Write(buffer []byte) (n int, err error) { 636 n, err = conn.Conn.Write(buffer) 637 if err != nil && err != io.EOF { 638 // Same as TunneledConn.Read() 639 select { 640 case conn.tunnel.signalPortForwardFailure <- struct{}{}: 641 default: 642 } 643 } 644 return 645 } 646 647 func (conn *TunneledConn) Close() error { 648 if conn.downstreamConn != nil { 649 conn.downstreamConn.Close() 650 } 651 return conn.Conn.Close() 652 } 653 654 type dialResult struct { 655 dialConn net.Conn 656 monitoringStartTime time.Time 657 monitoredConn *common.BurstMonitoredConn 658 sshClient *ssh.Client 659 sshRequests <-chan *ssh.Request 660 livenessTestMetrics *livenessTestMetrics 661 extraFailureAction func() 662 } 663 664 // dialTunnel is a helper that builds the transport layers and establishes the 665 // SSH connection. When additional dial configuration is used, dial metrics 666 // are recorded and returned. 667 func dialTunnel( 668 ctx context.Context, 669 config *Config, 670 dialParams *DialParameters) (_ *dialResult, retErr error) { 671 672 // Return immediately when overall context is canceled or timed-out. This 673 // avoids notice noise. 674 err := ctx.Err() 675 if err != nil { 676 return nil, errors.Trace(err) 677 } 678 679 p := getCustomParameters(config, dialParams) 680 timeout := p.Duration(parameters.TunnelConnectTimeout) 681 rateLimits := p.RateLimits(parameters.TunnelRateLimits) 682 obfuscatedSSHMinPadding := p.Int(parameters.ObfuscatedSSHMinPadding) 683 obfuscatedSSHMaxPadding := p.Int(parameters.ObfuscatedSSHMaxPadding) 684 livenessTestMinUpstreamBytes := p.Int(parameters.LivenessTestMinUpstreamBytes) 685 livenessTestMaxUpstreamBytes := p.Int(parameters.LivenessTestMaxUpstreamBytes) 686 livenessTestMinDownstreamBytes := p.Int(parameters.LivenessTestMinDownstreamBytes) 687 livenessTestMaxDownstreamBytes := p.Int(parameters.LivenessTestMaxDownstreamBytes) 688 burstUpstreamTargetBytes := int64(p.Int(parameters.ClientBurstUpstreamTargetBytes)) 689 burstUpstreamDeadline := p.Duration(parameters.ClientBurstUpstreamDeadline) 690 burstDownstreamTargetBytes := int64(p.Int(parameters.ClientBurstDownstreamTargetBytes)) 691 burstDownstreamDeadline := p.Duration(parameters.ClientBurstDownstreamDeadline) 692 tlsOSSHApplyTrafficShaping := p.WeightedCoinFlip(parameters.TLSTunnelTrafficShapingProbability) 693 tlsOSSHMinTLSPadding := p.Int(parameters.TLSTunnelMinTLSPadding) 694 tlsOSSHMaxTLSPadding := p.Int(parameters.TLSTunnelMaxTLSPadding) 695 p.Close() 696 697 // Ensure that, unless the base context is cancelled, any replayed dial 698 // parameters are cleared, no longer to be retried, if the tunnel fails to 699 // connect. 700 // 701 // Limitation: dials that fail to connect due to the server being in a 702 // load-limiting state are not distinguished and excepted from this 703 // logic. 704 dialSucceeded := false 705 baseCtx := ctx 706 var failedTunnelLivenessTestMetrics *livenessTestMetrics 707 var extraFailureAction func() 708 defer func() { 709 if !dialSucceeded && baseCtx.Err() != context.Canceled { 710 dialParams.Failed(config) 711 if extraFailureAction != nil { 712 extraFailureAction() 713 } 714 if retErr != nil { 715 _ = RecordFailedTunnelStat( 716 config, 717 dialParams, 718 failedTunnelLivenessTestMetrics, 719 -1, 720 -1, 721 retErr) 722 } 723 } 724 }() 725 726 var cancelFunc context.CancelFunc 727 ctx, cancelFunc = context.WithTimeout(ctx, timeout) 728 defer cancelFunc() 729 730 // DialDuration is the elapsed time for both successful and failed tunnel 731 // dials. For successful tunnels, it includes any the network protocol 732 // handshake(s), obfuscation protocol handshake(s), SSH handshake, and 733 // liveness test, when performed. 734 // 735 // Note: ensure DialDuration is set before calling any function which logs 736 // dial_duration. 737 738 startDialTime := time.Now() 739 defer func() { 740 dialParams.DialDuration = time.Since(startDialTime) 741 }() 742 743 // Note: dialParams.MeekResolvedIPAddress isn't set until the dial begins, 744 // so it will always be blank in NoticeConnectingServer. 745 746 NoticeConnectingServer(dialParams) 747 748 // Create the base transport: meek or direct connection 749 750 var dialConn net.Conn 751 752 if protocol.TunnelProtocolUsesTLSOSSH(dialParams.TunnelProtocol) { 753 754 dialConn, err = DialTLSTunnel( 755 ctx, 756 dialParams.GetTLSOSSHConfig(config), 757 dialParams.GetDialConfig(), 758 tlsOSSHApplyTrafficShaping, 759 tlsOSSHMinTLSPadding, 760 tlsOSSHMaxTLSPadding) 761 if err != nil { 762 return nil, errors.Trace(err) 763 } 764 765 } else if protocol.TunnelProtocolUsesMeek(dialParams.TunnelProtocol) { 766 767 dialConn, err = DialMeek( 768 ctx, 769 dialParams.GetMeekConfig(), 770 dialParams.GetDialConfig()) 771 if err != nil { 772 return nil, errors.Trace(err) 773 } 774 775 } else if protocol.TunnelProtocolUsesQUIC(dialParams.TunnelProtocol) { 776 777 packetConn, remoteAddr, err := NewUDPConn( 778 ctx, 779 dialParams.DirectDialAddress, 780 dialParams.GetDialConfig()) 781 if err != nil { 782 return nil, errors.Trace(err) 783 } 784 785 dialConn, err = quic.Dial( 786 ctx, 787 packetConn, 788 remoteAddr, 789 dialParams.QUICDialSNIAddress, 790 dialParams.QUICVersion, 791 dialParams.QUICClientHelloSeed, 792 dialParams.ServerEntry.SshObfuscatedKey, 793 dialParams.ObfuscatedQUICPaddingSeed, 794 dialParams.ObfuscatedQUICNonceTransformerParameters, 795 dialParams.QUICDisablePathMTUDiscovery) 796 if err != nil { 797 return nil, errors.Trace(err) 798 } 799 800 } else if protocol.TunnelProtocolUsesTapDance(dialParams.TunnelProtocol) { 801 802 dialConn, err = refraction.DialTapDance( 803 ctx, 804 config.EmitRefractionNetworkingLogs, 805 config.GetPsiphonDataDirectory(), 806 NewNetDialer(dialParams.GetDialConfig()), 807 dialParams.DirectDialAddress) 808 if err != nil { 809 return nil, errors.Trace(err) 810 } 811 812 } else if protocol.TunnelProtocolUsesConjure(dialParams.TunnelProtocol) { 813 814 // Specify a cache key with a scope that ensures that: 815 // 816 // (a) cached registrations aren't used across different networks, as a 817 // registration requires the client's public IP to match the value at time 818 // of registration; 819 // 820 // (b) cached registrations are associated with specific Psiphon server 821 // candidates, to ensure that replay will use the same phantom IP(s). 822 // 823 // This scheme allows for reuse of cached registrations on network A when a 824 // client roams from network A to network B and back to network A. 825 // 826 // Using the network ID as a proxy for client public IP address is a 827 // heurisitic: it's possible that a clients public IP address changes 828 // without the network ID changing, and it's not guaranteed that the client 829 // will be assigned the original public IP on network A; so there's some 830 // chance the registration cannot be reused. 831 832 diagnosticID := dialParams.ServerEntry.GetDiagnosticID() 833 834 cacheKey := dialParams.NetworkID + "-" + diagnosticID 835 836 conjureConfig := &refraction.ConjureConfig{ 837 RegistrationCacheTTL: dialParams.ConjureCachedRegistrationTTL, 838 RegistrationCacheKey: cacheKey, 839 Transport: dialParams.ConjureTransport, 840 DiagnosticID: diagnosticID, 841 Logger: NoticeCommonLogger(), 842 } 843 844 // Set extraFailureAction, which is invoked whenever the tunnel fails (i.e., 845 // where RecordFailedTunnelStat is invoked). The action will remove any 846 // cached registration. When refraction.DialConjure succeeds, the underlying 847 // registration is cached. After refraction.DialConjure returns, it no 848 // longer modifies the cached state of that registration, assuming that it 849 // remains valid and effective. However adversarial impact on a given 850 // phantom IP may not become evident until after the initial TCP connection 851 // establishment and handshake performed by refraction.DialConjure. For 852 // example, it may be that the phantom dial is targeted for severe 853 // throttling which begins or is only evident later in the flow. Scheduling 854 // a call to DeleteCachedConjureRegistration allows us to invalidate the 855 // cached registration for a tunnel that fails later in its lifecycle. 856 // 857 // Note that extraFailureAction will retain a reference to conjureConfig for 858 // the lifetime of the tunnel. 859 extraFailureAction = func() { 860 refraction.DeleteCachedConjureRegistration(conjureConfig) 861 } 862 863 if dialParams.ConjureAPIRegistration { 864 865 // Use MeekConn to domain front Conjure API registration. 866 // 867 // ConjureAPIRegistrarFrontingSpecs are applied via 868 // dialParams.GetMeekConfig, and will be subject to replay. 869 // 870 // Since DialMeek will create a TLS connection immediately, and a cached 871 // registration may be used, we will delay initializing the MeekConn-based 872 // RoundTripper until we know it's needed. This is implemented by passing 873 // in a RoundTripper that establishes a MeekConn when RoundTrip is called. 874 // 875 // In refraction.dial we configure 0 retries for API registration requests, 876 // assuming it's better to let another Psiphon candidate retry, with new 877 // domaing fronting parameters. As such, we expect only one round trip call 878 // per NewHTTPRoundTripper, so, in practise, there's no performance penalty 879 // from establishing a new MeekConn per round trip. 880 // 881 // Performing the full DialMeek/RoundTrip operation here allows us to call 882 // MeekConn.Close and ensure all resources are immediately cleaned up. 883 roundTrip := func(request *http.Request) (*http.Response, error) { 884 885 conn, err := DialMeek( 886 ctx, dialParams.GetMeekConfig(), dialParams.GetDialConfig()) 887 if err != nil { 888 return nil, errors.Trace(err) 889 } 890 defer conn.Close() 891 892 response, err := conn.RoundTrip(request) 893 if err != nil { 894 return nil, errors.Trace(err) 895 } 896 897 // Read the response into a buffer and close the response 898 // body, ensuring that MeekConn.Close closes all idle connections. 899 // 900 // Alternatively, we could Clone the request to set 901 // http.Request.Close and avoid keeping any idle connection 902 // open after the response body is read by gotapdance. Since 903 // the response body is small and since gotapdance does not 904 // stream the response body, we're taking this approach which 905 // ensures cleanup. 906 907 body, err := ioutil.ReadAll(response.Body) 908 _ = response.Body.Close() 909 if err != nil { 910 return nil, errors.Trace(err) 911 } 912 response.Body = io.NopCloser(bytes.NewReader(body)) 913 914 return response, nil 915 } 916 917 conjureConfig.APIRegistrarHTTPClient = &http.Client{ 918 Transport: common.NewHTTPRoundTripper(roundTrip), 919 } 920 921 conjureConfig.APIRegistrarBidirectionalURL = 922 dialParams.ConjureAPIRegistrarBidirectionalURL 923 conjureConfig.APIRegistrarDelay = dialParams.ConjureAPIRegistrarDelay 924 925 } else if dialParams.ConjureDecoyRegistration { 926 927 // The Conjure "phantom" connection is compatible with fragmentation, but 928 // the decoy registrar connection, like Tapdance, is not, so force it off. 929 // Any tunnel fragmentation metrics will refer to the "phantom" connection 930 // only. 931 conjureConfig.DecoyRegistrarDialer = NewNetDialer( 932 dialParams.GetDialConfig().WithoutFragmentor()) 933 conjureConfig.DecoyRegistrarWidth = dialParams.ConjureDecoyRegistrarWidth 934 conjureConfig.DecoyRegistrarDelay = dialParams.ConjureDecoyRegistrarDelay 935 } 936 937 dialConn, err = refraction.DialConjure( 938 ctx, 939 config.EmitRefractionNetworkingLogs, 940 config.GetPsiphonDataDirectory(), 941 NewNetDialer(dialParams.GetDialConfig()), 942 dialParams.DirectDialAddress, 943 conjureConfig) 944 if err != nil { 945 return nil, errors.Trace(err) 946 } 947 948 } else { 949 950 dialConn, err = DialTCP( 951 ctx, 952 dialParams.DirectDialAddress, 953 dialParams.GetDialConfig()) 954 if err != nil { 955 return nil, errors.Trace(err) 956 } 957 } 958 959 // Some conns report additional metrics. fragmentor.Conns report 960 // fragmentor configs. 961 if metricsSource, ok := dialConn.(common.MetricsSource); ok { 962 dialParams.DialConnMetrics = metricsSource 963 } 964 965 if noticeMetricsSource, ok := dialConn.(common.NoticeMetricsSource); ok { 966 dialParams.DialConnNoticeMetrics = noticeMetricsSource 967 } 968 969 // If dialConn is not a Closer, tunnel failure detection may be slower 970 if _, ok := dialConn.(common.Closer); !ok { 971 NoticeWarning("tunnel.dialTunnel: dialConn is not a Closer") 972 } 973 974 cleanupConn := dialConn 975 defer func() { 976 // Cleanup on error 977 if cleanupConn != nil { 978 cleanupConn.Close() 979 } 980 }() 981 982 monitoringStartTime := time.Now() 983 monitoredConn := common.NewBurstMonitoredConn( 984 dialConn, 985 false, 986 burstUpstreamTargetBytes, burstUpstreamDeadline, 987 burstDownstreamTargetBytes, burstDownstreamDeadline) 988 989 // Apply throttling (if configured) 990 throttledConn := common.NewThrottledConn( 991 monitoredConn, 992 rateLimits) 993 994 // Add obfuscated SSH layer 995 var sshConn net.Conn = throttledConn 996 if protocol.TunnelProtocolUsesObfuscatedSSH(dialParams.TunnelProtocol) { 997 obfuscatedSSHConn, err := obfuscator.NewClientObfuscatedSSHConn( 998 throttledConn, 999 dialParams.ServerEntry.SshObfuscatedKey, 1000 dialParams.ObfuscatorPaddingSeed, 1001 dialParams.OSSHObfuscatorSeedTransformerParameters, 1002 dialParams.OSSHPrefixSpec, 1003 dialParams.OSSHPrefixSplitConfig, 1004 &obfuscatedSSHMinPadding, 1005 &obfuscatedSSHMaxPadding) 1006 if err != nil { 1007 return nil, errors.Trace(err) 1008 } 1009 sshConn = obfuscatedSSHConn 1010 dialParams.ObfuscatedSSHConnMetrics = obfuscatedSSHConn 1011 } 1012 1013 // Now establish the SSH session over the conn transport 1014 expectedPublicKey, err := base64.StdEncoding.DecodeString( 1015 dialParams.ServerEntry.SshHostKey) 1016 if err != nil { 1017 return nil, errors.Trace(err) 1018 } 1019 sshCertChecker := &ssh.CertChecker{ 1020 IsHostAuthority: func(auth ssh.PublicKey, address string) bool { 1021 // Psiphon servers do not currently use SSH certificates. This CertChecker 1022 // code path may still be hit if a client attempts to connect using an 1023 // obsolete server entry. 1024 return false 1025 }, 1026 HostKeyFallback: func(addr string, remote net.Addr, publicKey ssh.PublicKey) error { 1027 if !bytes.Equal(expectedPublicKey, publicKey.Marshal()) { 1028 return errors.TraceNew("unexpected host public key") 1029 } 1030 return nil 1031 }, 1032 } 1033 1034 sshPasswordPayload := &protocol.SSHPasswordPayload{ 1035 SessionId: config.SessionID, 1036 SshPassword: dialParams.ServerEntry.SshPassword, 1037 ClientCapabilities: []string{protocol.CLIENT_CAPABILITY_SERVER_REQUESTS}, 1038 } 1039 1040 payload, err := json.Marshal(sshPasswordPayload) 1041 if err != nil { 1042 return nil, errors.Trace(err) 1043 } 1044 1045 sshClientConfig := &ssh.ClientConfig{ 1046 User: dialParams.ServerEntry.SshUsername, 1047 Auth: []ssh.AuthMethod{ 1048 ssh.Password(string(payload)), 1049 }, 1050 HostKeyCallback: sshCertChecker.CheckHostKey, 1051 ClientVersion: dialParams.SSHClientVersion, 1052 } 1053 1054 sshClientConfig.KEXPRNGSeed = dialParams.SSHKEXSeed 1055 1056 if protocol.TunnelProtocolUsesObfuscatedSSH(dialParams.TunnelProtocol) { 1057 if config.ObfuscatedSSHAlgorithms != nil { 1058 sshClientConfig.KeyExchanges = []string{config.ObfuscatedSSHAlgorithms[0]} 1059 sshClientConfig.Ciphers = []string{config.ObfuscatedSSHAlgorithms[1]} 1060 sshClientConfig.MACs = []string{config.ObfuscatedSSHAlgorithms[2]} 1061 sshClientConfig.HostKeyAlgorithms = []string{config.ObfuscatedSSHAlgorithms[3]} 1062 1063 } else { 1064 // With Encrypt-then-MAC hash algorithms, packet length is 1065 // transmitted in plaintext, which aids in traffic analysis. 1066 // 1067 // TUNNEL_PROTOCOL_SSH is excepted since its KEX appears in plaintext, 1068 // and the protocol is intended to look like SSH on the wire. 1069 sshClientConfig.NoEncryptThenMACHash = true 1070 } 1071 } else { 1072 // For TUNNEL_PROTOCOL_SSH only, the server is expected to randomize 1073 // its KEX; setting PeerKEXPRNGSeed will ensure successful negotiation 1074 // betweem two randomized KEXes. 1075 if dialParams.ServerEntry.SshObfuscatedKey != "" { 1076 sshClientConfig.PeerKEXPRNGSeed, err = protocol.DeriveSSHServerKEXPRNGSeed( 1077 dialParams.ServerEntry.SshObfuscatedKey) 1078 if err != nil { 1079 return nil, errors.Trace(err) 1080 } 1081 } 1082 } 1083 1084 // The ssh session establishment (via ssh.NewClientConn) is wrapped 1085 // in a timeout to ensure it won't hang. We've encountered firewalls 1086 // that allow the TCP handshake to complete but then send a RST to the 1087 // server-side and nothing to the client-side, and if that happens 1088 // while ssh.NewClientConn is reading, it may wait forever. The timeout 1089 // closes the conn, which interrupts it. 1090 // Note: TCP handshake timeouts are provided by TCPConn, and session 1091 // timeouts *after* ssh establishment are provided by the ssh keep alive 1092 // in operate tunnel. 1093 1094 type sshNewClientResult struct { 1095 sshClient *ssh.Client 1096 sshRequests <-chan *ssh.Request 1097 livenessTestMetrics *livenessTestMetrics 1098 err error 1099 } 1100 1101 resultChannel := make(chan sshNewClientResult) 1102 1103 // Call NewClientConn in a goroutine, as it blocks on SSH handshake network 1104 // operations, and would block canceling or shutdown. If the parent context 1105 // is canceled, close the net.Conn underlying SSH, which will interrupt the 1106 // SSH handshake that may be blocking NewClientConn. 1107 1108 go func() { 1109 // The following is adapted from ssh.Dial(), here using a custom conn 1110 // The sshAddress is passed through to host key verification callbacks; we don't use it. 1111 sshAddress := "" 1112 sshClientConn, sshChannels, sshRequests, err := ssh.NewClientConn( 1113 sshConn, sshAddress, sshClientConfig) 1114 1115 var sshClient *ssh.Client 1116 var metrics *livenessTestMetrics 1117 1118 if err == nil { 1119 1120 // sshRequests is handled by operateTunnel. 1121 // ssh.NewClient also expects to handle the sshRequests 1122 // value from ssh.NewClientConn and will spawn a goroutine 1123 // to handle the <-chan *ssh.Request, so we must provide 1124 // a closed channel to ensure that goroutine halts instead 1125 // of hanging on a nil channel. 1126 noRequests := make(chan *ssh.Request) 1127 close(noRequests) 1128 1129 sshClient = ssh.NewClient(sshClientConn, sshChannels, noRequests) 1130 1131 if livenessTestMaxUpstreamBytes > 0 || livenessTestMaxDownstreamBytes > 0 { 1132 1133 // When configured, perform a liveness test which sends and 1134 // receives bytes through the tunnel to ensure the tunnel had 1135 // not been blocked upon or shortly after connecting. This 1136 // test is performed concurrently for each establishment 1137 // candidate before selecting a successful tunnel. 1138 // 1139 // Note that the liveness test is subject to the 1140 // TunnelConnectTimeout, which should be adjusted 1141 // accordinging. 1142 1143 metrics, err = performLivenessTest( 1144 sshClient, 1145 livenessTestMinUpstreamBytes, livenessTestMaxUpstreamBytes, 1146 livenessTestMinDownstreamBytes, livenessTestMaxDownstreamBytes, 1147 dialParams.LivenessTestSeed) 1148 1149 // Skip notice when cancelling. 1150 if baseCtx.Err() == nil { 1151 NoticeLivenessTest( 1152 dialParams.ServerEntry.GetDiagnosticID(), metrics, err == nil) 1153 } 1154 } 1155 } 1156 1157 resultChannel <- sshNewClientResult{sshClient, sshRequests, metrics, err} 1158 }() 1159 1160 var result sshNewClientResult 1161 1162 select { 1163 case result = <-resultChannel: 1164 case <-ctx.Done(): 1165 1166 // Interrupt the goroutine and capture its error context to 1167 // distinguish point of failure. 1168 err := ctx.Err() 1169 sshConn.Close() 1170 result = <-resultChannel 1171 if result.err != nil { 1172 result.err = fmt.Errorf("%s: %s", err, result.err) 1173 } else { 1174 result.err = err 1175 } 1176 } 1177 1178 if result.err != nil { 1179 failedTunnelLivenessTestMetrics = result.livenessTestMetrics 1180 return nil, errors.Trace(result.err) 1181 } 1182 1183 dialSucceeded = true 1184 1185 NoticeConnectedServer(dialParams) 1186 1187 cleanupConn = nil 1188 1189 // Invoke DNS cache extension (if enabled in the resolver) now that the 1190 // tunnel is connected and the Psiphon server is authenticated. This 1191 // demonstrates that any domain name resolved to an endpoint that is or 1192 // is forwarded to the expected Psiphon server. 1193 // 1194 // Limitation: DNS cache extension is not implemented for Refraction 1195 // Networking protocols. iOS VPN, the primary use case for DNS cache 1196 // extension, does not enable Refraction Networking. 1197 if protocol.TunnelProtocolUsesFrontedMeek(dialParams.TunnelProtocol) { 1198 resolver := config.GetResolver() 1199 if resolver != nil { 1200 resolver.VerifyCacheExtension(dialParams.MeekFrontingDialAddress) 1201 } 1202 } 1203 1204 // When configured to do so, hold-off on activating this tunnel. This allows 1205 // some extra time for slower but less resource intensive protocols to 1206 // establish tunnels. By holding off post-connect, the client has this 1207 // established tunnel ready to activate in case other protocols fail to 1208 // establish. This hold-off phase continues to consume one connection worker. 1209 // 1210 // The network latency multiplier is not applied to HoldOffTunnelDuration, 1211 // as the goal is to apply a consistent hold-off range across all tunnel 1212 // candidates; and this avoids scaling up any delay users experience. 1213 // 1214 // The hold-off is applied regardless of whether this is the first tunnel 1215 // in a session or a reconnection, even to a server affinity candidate, 1216 // so that the advantage for other protocols persists. 1217 1218 if dialParams.HoldOffTunnelDuration > 0 { 1219 NoticeHoldOffTunnel(dialParams.ServerEntry.GetDiagnosticID(), dialParams.HoldOffTunnelDuration) 1220 common.SleepWithContext(ctx, dialParams.HoldOffTunnelDuration) 1221 } 1222 1223 // Note: dialConn may be used to close the underlying network connection 1224 // but should not be used to perform I/O as that would interfere with SSH 1225 // (and also bypasses throttling). 1226 1227 return &dialResult{ 1228 dialConn: dialConn, 1229 monitoringStartTime: monitoringStartTime, 1230 monitoredConn: monitoredConn, 1231 sshClient: result.sshClient, 1232 sshRequests: result.sshRequests, 1233 livenessTestMetrics: result.livenessTestMetrics, 1234 extraFailureAction: extraFailureAction, 1235 }, 1236 nil 1237 } 1238 1239 // Fields are exported for JSON encoding in NoticeLivenessTest. 1240 type livenessTestMetrics struct { 1241 Duration string 1242 UpstreamBytes int 1243 SentUpstreamBytes int 1244 DownstreamBytes int 1245 ReceivedDownstreamBytes int 1246 } 1247 1248 func performLivenessTest( 1249 sshClient *ssh.Client, 1250 minUpstreamBytes, maxUpstreamBytes int, 1251 minDownstreamBytes, maxDownstreamBytes int, 1252 livenessTestPRNGSeed *prng.Seed) (*livenessTestMetrics, error) { 1253 1254 metrics := new(livenessTestMetrics) 1255 1256 defer func(startTime time.Time) { 1257 metrics.Duration = time.Since(startTime).String() 1258 }(time.Now()) 1259 1260 PRNG := prng.NewPRNGWithSeed(livenessTestPRNGSeed) 1261 1262 metrics.UpstreamBytes = PRNG.Range(minUpstreamBytes, maxUpstreamBytes) 1263 metrics.DownstreamBytes = PRNG.Range(minDownstreamBytes, maxDownstreamBytes) 1264 1265 request := &protocol.RandomStreamRequest{ 1266 UpstreamBytes: metrics.UpstreamBytes, 1267 DownstreamBytes: metrics.DownstreamBytes, 1268 } 1269 1270 extraData, err := json.Marshal(request) 1271 if err != nil { 1272 return metrics, errors.Trace(err) 1273 } 1274 1275 channel, requests, err := sshClient.OpenChannel( 1276 protocol.RANDOM_STREAM_CHANNEL_TYPE, extraData) 1277 if err != nil { 1278 return metrics, errors.Trace(err) 1279 } 1280 defer channel.Close() 1281 1282 go ssh.DiscardRequests(requests) 1283 1284 sent := 0 1285 received := 0 1286 upstream := new(sync.WaitGroup) 1287 var errUpstream, errDownstream error 1288 1289 if metrics.UpstreamBytes > 0 { 1290 1291 // Process streams concurrently to minimize elapsed time. This also 1292 // avoids a unidirectional flow burst early in the tunnel lifecycle. 1293 1294 upstream.Add(1) 1295 go func() { 1296 defer upstream.Done() 1297 1298 // In consideration of memory-constrained environments, use modest-sized copy 1299 // buffers since many tunnel establishment workers may run the liveness test 1300 // concurrently. 1301 var buffer [4096]byte 1302 1303 n, err := common.CopyNBuffer(channel, rand.Reader, int64(metrics.UpstreamBytes), buffer[:]) 1304 sent = int(n) 1305 if err != nil { 1306 errUpstream = errors.Trace(err) 1307 } 1308 }() 1309 } 1310 1311 if metrics.DownstreamBytes > 0 { 1312 var buffer [4096]byte 1313 n, err := common.CopyNBuffer(ioutil.Discard, channel, int64(metrics.DownstreamBytes), buffer[:]) 1314 received = int(n) 1315 if err != nil { 1316 errDownstream = errors.Trace(err) 1317 } 1318 } 1319 1320 upstream.Wait() 1321 metrics.SentUpstreamBytes = sent 1322 metrics.ReceivedDownstreamBytes = received 1323 1324 if errUpstream != nil { 1325 return metrics, errUpstream 1326 } else if errDownstream != nil { 1327 return metrics, errDownstream 1328 } 1329 1330 return metrics, nil 1331 } 1332 1333 // operateTunnel monitors the health of the tunnel and performs 1334 // periodic work. 1335 // 1336 // BytesTransferred and TotalBytesTransferred notices are emitted 1337 // for live reporting and diagnostics reporting, respectively. 1338 // 1339 // Status requests are sent to the Psiphon API to report bytes 1340 // transferred. 1341 // 1342 // Periodic SSH keep alive packets are sent to ensure the underlying 1343 // TCP connection isn't terminated by NAT, or other network 1344 // interference -- or test if it has been terminated while the device 1345 // has been asleep. When a keep alive times out, the tunnel is 1346 // considered failed. 1347 // 1348 // An immediate SSH keep alive "probe" is sent to test the tunnel and 1349 // server responsiveness when a port forward failure is detected: a 1350 // failed dial or failed read/write. This keep alive has a shorter 1351 // timeout. 1352 // 1353 // Note that port forward failures may be due to non-failure conditions. 1354 // For example, when the user inputs an invalid domain name and 1355 // resolution is done by the ssh server; or trying to connect to a 1356 // non-white-listed port; and the error message in these cases is not 1357 // distinguishable from a a true server error (a common error message, 1358 // "ssh: rejected: administratively prohibited (open failed)", may be 1359 // returned for these cases but also if the server has run out of 1360 // ephemeral ports, for example). 1361 // 1362 // SSH keep alives are not sent when the tunnel has been recently 1363 // active (not only does tunnel activity obviate the necessity of a keep 1364 // alive, testing has shown that keep alives may time out for "busy" 1365 // tunnels, especially over meek protocol and other high latency 1366 // conditions). 1367 // 1368 // "Recently active" is defined has having received payload bytes. Sent 1369 // bytes are not considered as testing has shown bytes may appear to 1370 // send when certain NAT devices have interfered with the tunnel, while 1371 // no bytes are received. In a pathological case, with DNS implemented 1372 // as tunneled UDP, a browser may wait excessively for a domain name to 1373 // resolve, while no new port forward is attempted which would otherwise 1374 // result in a tunnel failure detection. 1375 // 1376 // TODO: change "recently active" to include having received any 1377 // SSH protocol messages from the server, not just user payload? 1378 func (tunnel *Tunnel) operateTunnel(tunnelOwner TunnelOwner) { 1379 defer tunnel.operateWaitGroup.Done() 1380 1381 now := time.Now() 1382 lastBytesReceivedTime := now 1383 lastTotalBytesTransferedTime := now 1384 totalSent := int64(0) 1385 totalReceived := int64(0) 1386 setDialParamsSucceeded := false 1387 1388 noticeBytesTransferredTicker := time.NewTicker(1 * time.Second) 1389 defer noticeBytesTransferredTicker.Stop() 1390 1391 // The next status request and ssh keep alive times are picked at random, 1392 // from a range, to make the resulting traffic less fingerprintable, 1393 // Note: not using Tickers since these are not fixed time periods. 1394 nextStatusRequestPeriod := func() time.Duration { 1395 p := tunnel.getCustomParameters() 1396 return prng.Period( 1397 p.Duration(parameters.PsiphonAPIStatusRequestPeriodMin), 1398 p.Duration(parameters.PsiphonAPIStatusRequestPeriodMax)) 1399 } 1400 1401 statsTimer := time.NewTimer(nextStatusRequestPeriod()) 1402 defer statsTimer.Stop() 1403 1404 // Schedule an almost-immediate status request to deliver any unreported 1405 // persistent stats. 1406 unreported := CountUnreportedPersistentStats() 1407 if unreported > 0 { 1408 NoticeInfo("Unreported persistent stats: %d", unreported) 1409 p := tunnel.getCustomParameters() 1410 statsTimer.Reset( 1411 prng.Period( 1412 p.Duration(parameters.PsiphonAPIStatusRequestShortPeriodMin), 1413 p.Duration(parameters.PsiphonAPIStatusRequestShortPeriodMax))) 1414 } 1415 1416 nextSshKeepAlivePeriod := func() time.Duration { 1417 p := tunnel.getCustomParameters() 1418 return prng.Period( 1419 p.Duration(parameters.SSHKeepAlivePeriodMin), 1420 p.Duration(parameters.SSHKeepAlivePeriodMax)) 1421 } 1422 1423 // TODO: don't initialize timer when config.DisablePeriodicSshKeepAlive is set 1424 sshKeepAliveTimer := time.NewTimer(nextSshKeepAlivePeriod()) 1425 if tunnel.config.DisablePeriodicSshKeepAlive { 1426 sshKeepAliveTimer.Stop() 1427 } else { 1428 defer sshKeepAliveTimer.Stop() 1429 } 1430 1431 // Perform network requests in separate goroutines so as not to block 1432 // other operations. 1433 requestsWaitGroup := new(sync.WaitGroup) 1434 1435 requestsWaitGroup.Add(1) 1436 signalStatusRequest := make(chan struct{}) 1437 go func() { 1438 defer requestsWaitGroup.Done() 1439 for range signalStatusRequest { 1440 sendStats(tunnel) 1441 } 1442 }() 1443 1444 requestsWaitGroup.Add(1) 1445 signalPeriodicSshKeepAlive := make(chan time.Duration) 1446 sshKeepAliveError := make(chan error, 1) 1447 go func() { 1448 defer requestsWaitGroup.Done() 1449 isFirstPeriodicKeepAlive := true 1450 for timeout := range signalPeriodicSshKeepAlive { 1451 bytesUp := atomic.LoadInt64(&totalSent) 1452 bytesDown := atomic.LoadInt64(&totalReceived) 1453 err := tunnel.sendSshKeepAlive( 1454 isFirstPeriodicKeepAlive, false, timeout, bytesUp, bytesDown) 1455 if err != nil { 1456 select { 1457 case sshKeepAliveError <- err: 1458 default: 1459 } 1460 } 1461 isFirstPeriodicKeepAlive = false 1462 } 1463 }() 1464 1465 // Probe-type SSH keep alives have a distinct send worker and may be sent 1466 // concurrently, to ensure a long period keep alive timeout doesn't delay 1467 // failed tunnel detection. 1468 1469 requestsWaitGroup.Add(1) 1470 signalProbeSshKeepAlive := make(chan time.Duration) 1471 go func() { 1472 defer requestsWaitGroup.Done() 1473 for timeout := range signalProbeSshKeepAlive { 1474 bytesUp := atomic.LoadInt64(&totalSent) 1475 bytesDown := atomic.LoadInt64(&totalReceived) 1476 err := tunnel.sendSshKeepAlive( 1477 false, true, timeout, bytesUp, bytesDown) 1478 if err != nil { 1479 select { 1480 case sshKeepAliveError <- err: 1481 default: 1482 } 1483 } 1484 } 1485 }() 1486 1487 shutdown := false 1488 var err error 1489 for !shutdown && err == nil { 1490 select { 1491 case <-noticeBytesTransferredTicker.C: 1492 sent, received := transferstats.ReportRecentBytesTransferredForServer( 1493 tunnel.dialParams.ServerEntry.IpAddress) 1494 1495 if received > 0 { 1496 lastBytesReceivedTime = time.Now() 1497 } 1498 1499 bytesUp := atomic.AddInt64(&totalSent, sent) 1500 bytesDown := atomic.AddInt64(&totalReceived, received) 1501 1502 p := tunnel.getCustomParameters() 1503 noticePeriod := p.Duration(parameters.TotalBytesTransferredNoticePeriod) 1504 doEmitMemoryMetrics := p.Bool(parameters.TotalBytesTransferredEmitMemoryMetrics) 1505 replayTargetUpstreamBytes := p.Int(parameters.ReplayTargetUpstreamBytes) 1506 replayTargetDownstreamBytes := p.Int(parameters.ReplayTargetDownstreamBytes) 1507 replayTargetTunnelDuration := p.Duration(parameters.ReplayTargetTunnelDuration) 1508 1509 if lastTotalBytesTransferedTime.Add(noticePeriod).Before(time.Now()) { 1510 NoticeTotalBytesTransferred( 1511 tunnel.dialParams.ServerEntry.GetDiagnosticID(), bytesUp, bytesDown) 1512 if doEmitMemoryMetrics { 1513 emitMemoryMetrics() 1514 } 1515 lastTotalBytesTransferedTime = time.Now() 1516 } 1517 1518 // Only emit the frequent BytesTransferred notice when tunnel is not idle. 1519 if tunnel.config.EmitBytesTransferred && (sent > 0 || received > 0) { 1520 NoticeBytesTransferred( 1521 tunnel.dialParams.ServerEntry.GetDiagnosticID(), sent, received) 1522 } 1523 1524 // Once the tunnel has connected, activated, successfully transmitted the 1525 // targeted number of bytes, and been up for the targeted duration 1526 // (measured from the end of establishment), store its dial parameters for 1527 // subsequent replay. 1528 // 1529 // Even when target bytes and duration are all 0, the tunnel must remain up 1530 // for at least 1 second due to use of noticeBytesTransferredTicker; for 1531 // the same reason the granularity of ReplayTargetTunnelDuration is 1532 // seconds. 1533 if !setDialParamsSucceeded && 1534 bytesUp >= int64(replayTargetUpstreamBytes) && 1535 bytesDown >= int64(replayTargetDownstreamBytes) && 1536 time.Since(tunnel.establishedTime) >= replayTargetTunnelDuration { 1537 1538 tunnel.dialParams.Succeeded() 1539 setDialParamsSucceeded = true 1540 } 1541 1542 case <-statsTimer.C: 1543 select { 1544 case signalStatusRequest <- struct{}{}: 1545 default: 1546 } 1547 statsTimer.Reset(nextStatusRequestPeriod()) 1548 1549 case <-sshKeepAliveTimer.C: 1550 p := tunnel.getCustomParameters() 1551 inactivePeriod := p.Duration(parameters.SSHKeepAlivePeriodicInactivePeriod) 1552 if lastBytesReceivedTime.Add(inactivePeriod).Before(time.Now()) { 1553 timeout := p.Duration(parameters.SSHKeepAlivePeriodicTimeout) 1554 select { 1555 case signalPeriodicSshKeepAlive <- timeout: 1556 default: 1557 } 1558 } 1559 sshKeepAliveTimer.Reset(nextSshKeepAlivePeriod()) 1560 1561 case <-tunnel.signalPortForwardFailure: 1562 // Note: no mutex on portForwardFailureTotal; only referenced here 1563 tunnel.totalPortForwardFailures++ 1564 NoticeInfo("port forward failures for %s: %d", 1565 tunnel.dialParams.ServerEntry.GetDiagnosticID(), 1566 tunnel.totalPortForwardFailures) 1567 1568 // If the underlying Conn has closed (meek and other plugin protocols may 1569 // close themselves in certain error conditions), the tunnel has certainly 1570 // failed. Otherwise, probe with an SSH keep alive. 1571 // 1572 // TODO: the IsClosed case omits the failed tunnel logging and reset 1573 // actions performed by sendSshKeepAlive. Should self-closing protocols 1574 // perform these actions themselves? 1575 1576 if tunnel.conn.IsClosed() { 1577 err = errors.TraceNew("underlying conn is closed") 1578 } else { 1579 p := tunnel.getCustomParameters() 1580 inactivePeriod := p.Duration(parameters.SSHKeepAliveProbeInactivePeriod) 1581 if lastBytesReceivedTime.Add(inactivePeriod).Before(time.Now()) { 1582 timeout := p.Duration(parameters.SSHKeepAliveProbeTimeout) 1583 select { 1584 case signalProbeSshKeepAlive <- timeout: 1585 default: 1586 } 1587 } 1588 if !tunnel.config.DisablePeriodicSshKeepAlive { 1589 sshKeepAliveTimer.Reset(nextSshKeepAlivePeriod()) 1590 } 1591 1592 } 1593 1594 case err = <-sshKeepAliveError: 1595 1596 case serverRequest := <-tunnel.sshServerRequests: 1597 if serverRequest != nil { 1598 err := HandleServerRequest(tunnelOwner, tunnel, serverRequest.Type, serverRequest.Payload) 1599 if err == nil { 1600 serverRequest.Reply(true, nil) 1601 } else { 1602 NoticeWarning("HandleServerRequest for %s failed: %s", serverRequest.Type, err) 1603 serverRequest.Reply(false, nil) 1604 1605 } 1606 } 1607 1608 case <-tunnel.operateCtx.Done(): 1609 shutdown = true 1610 } 1611 } 1612 1613 close(signalPeriodicSshKeepAlive) 1614 close(signalProbeSshKeepAlive) 1615 close(signalStatusRequest) 1616 requestsWaitGroup.Wait() 1617 1618 // Capture bytes transferred since the last noticeBytesTransferredTicker tick 1619 sent, received := transferstats.ReportRecentBytesTransferredForServer( 1620 tunnel.dialParams.ServerEntry.IpAddress) 1621 bytesUp := atomic.AddInt64(&totalSent, sent) 1622 bytesDown := atomic.AddInt64(&totalReceived, received) 1623 1624 // Always emit a final NoticeTotalBytesTransferred 1625 NoticeTotalBytesTransferred( 1626 tunnel.dialParams.ServerEntry.GetDiagnosticID(), bytesUp, bytesDown) 1627 1628 if err == nil { 1629 1630 NoticeInfo("shutdown operate tunnel") 1631 1632 // This commanded shutdown case is initiated by Tunnel.Close, which will 1633 // wait up to parameters.TunnelOperateShutdownTimeout to allow the following 1634 // requests to complete. 1635 1636 // Send a final status request in order to report any outstanding persistent 1637 // stats and domain bytes transferred as soon as possible. 1638 1639 sendStats(tunnel) 1640 1641 // The controller connectedReporter may have initiated a connected request 1642 // concurrent to this commanded shutdown. SetInFlightConnectedRequest 1643 // ensures that a connected request doesn't start after the commanded 1644 // shutdown. AwaitInFlightConnectedRequest blocks until any in flight 1645 // request completes or is aborted after TunnelOperateShutdownTimeout. 1646 // 1647 // As any connected request is performed by a concurrent goroutine, 1648 // sendStats is called first and AwaitInFlightConnectedRequest second. 1649 1650 tunnel.AwaitInFlightConnectedRequest() 1651 1652 } else { 1653 1654 NoticeWarning("operate tunnel error for %s: %s", 1655 tunnel.dialParams.ServerEntry.GetDiagnosticID(), err) 1656 1657 tunnelOwner.SignalTunnelFailure(tunnel) 1658 } 1659 } 1660 1661 // sendSshKeepAlive is a helper which sends a keepalive@openssh.com request 1662 // on the specified SSH connections and returns true of the request succeeds 1663 // within a specified timeout. If the request fails, the associated conn is 1664 // closed, which will terminate the associated tunnel. 1665 func (tunnel *Tunnel) sendSshKeepAlive( 1666 isFirstPeriodicKeepAlive bool, 1667 isProbeKeepAlive bool, 1668 timeout time.Duration, 1669 bytesUp int64, 1670 bytesDown int64) error { 1671 1672 p := tunnel.getCustomParameters() 1673 1674 // Random padding to frustrate fingerprinting. 1675 request := prng.Padding( 1676 p.Int(parameters.SSHKeepAlivePaddingMinBytes), 1677 p.Int(parameters.SSHKeepAlivePaddingMaxBytes)) 1678 1679 speedTestSample := isFirstPeriodicKeepAlive 1680 if !speedTestSample { 1681 speedTestSample = p.WeightedCoinFlip( 1682 parameters.SSHKeepAliveSpeedTestSampleProbability) 1683 } 1684 1685 networkConnectivityPollPeriod := p.Duration( 1686 parameters.SSHKeepAliveNetworkConnectivityPollingPeriod) 1687 1688 resetOnFailure := p.WeightedCoinFlip( 1689 parameters.SSHKeepAliveResetOnFailureProbability) 1690 1691 p.Close() 1692 1693 // Note: there is no request context since SSH requests cannot be interrupted 1694 // directly. Closing the tunnel will interrupt the request. A timeout is set 1695 // to unblock this function, but the goroutine may not exit until the tunnel 1696 // is closed. 1697 1698 // Use a buffer of 1 as there are two senders and only one guaranteed receive. 1699 errChannel := make(chan error, 1) 1700 1701 afterFunc := time.AfterFunc(timeout, func() { 1702 errChannel <- errors.TraceNew("timed out") 1703 }) 1704 defer afterFunc.Stop() 1705 1706 go func() { 1707 1708 startTime := time.Now() 1709 1710 // Note: reading a reply is important for last-received-time tunnel 1711 // duration calculation. 1712 requestOk, response, err := tunnel.sshClient.SendRequest( 1713 "keepalive@openssh.com", true, request) 1714 1715 elapsedTime := time.Since(startTime) 1716 1717 errChannel <- err 1718 1719 success := (err == nil && requestOk) 1720 1721 if success && isProbeKeepAlive { 1722 NoticeInfo("Probe SSH keep-alive RTT: %s", elapsedTime) 1723 } 1724 1725 // Record the keep alive round trip as a speed test sample. The first 1726 // periodic keep alive is always recorded, as many tunnels are short-lived 1727 // and we want to ensure that some data is gathered. Subsequent keep alives 1728 // are recorded with some configurable probability, which, considering that 1729 // only the last SpeedTestMaxSampleCount samples are retained, enables 1730 // tuning the sampling frequency. 1731 1732 if success && speedTestSample { 1733 1734 err = tactics.AddSpeedTestSample( 1735 tunnel.config.GetParameters(), 1736 GetTacticsStorer(tunnel.config), 1737 tunnel.config.GetNetworkID(), 1738 tunnel.dialParams.ServerEntry.Region, 1739 tunnel.dialParams.TunnelProtocol, 1740 elapsedTime, 1741 request, 1742 response) 1743 if err != nil { 1744 NoticeWarning("AddSpeedTestSample failed: %s", errors.Trace(err)) 1745 } 1746 } 1747 }() 1748 1749 // While awaiting the response, poll the network connectivity state. If there 1750 // is network connectivity, on the same network, for the entire duration of 1751 // the keep alive request and the request fails, record a failed tunnel 1752 // event. 1753 // 1754 // The network connectivity heuristic is intended to reduce the number of 1755 // failed tunnels reported due to routine situations such as varying mobile 1756 // network conditions. The polling may produce false positives if the network 1757 // goes down and up between polling periods, or changes to a new network and 1758 // back to the previous network between polling periods. 1759 // 1760 // For platforms that don't provide a NetworkConnectivityChecker, it is 1761 // assumed that there is network connectivity. 1762 // 1763 // The approximate number of tunneled bytes successfully sent and received is 1764 // recorded in the failed tunnel event as a quality indicator. 1765 1766 ticker := time.NewTicker(networkConnectivityPollPeriod) 1767 defer ticker.Stop() 1768 continuousNetworkConnectivity := true 1769 networkID := tunnel.config.GetNetworkID() 1770 1771 var err error 1772 loop: 1773 for { 1774 select { 1775 case err = <-errChannel: 1776 break loop 1777 case <-ticker.C: 1778 connectivityChecker := tunnel.config.NetworkConnectivityChecker 1779 if (connectivityChecker != nil && 1780 connectivityChecker.HasNetworkConnectivity() != 1) || 1781 (networkID != tunnel.config.GetNetworkID()) { 1782 1783 continuousNetworkConnectivity = false 1784 } 1785 } 1786 } 1787 1788 err = errors.Trace(err) 1789 1790 if err != nil { 1791 tunnel.sshClient.Close() 1792 tunnel.conn.Close() 1793 1794 // Don't perform log or reset actions when the keep alive may have been 1795 // interrupted due to shutdown. 1796 1797 isShutdown := false 1798 select { 1799 case <-tunnel.operateCtx.Done(): 1800 isShutdown = true 1801 default: 1802 } 1803 1804 // Ensure that at most one of the two SSH keep alive workers (periodic and 1805 // probe) perform the log and reset actions. 1806 1807 wasHandled := atomic.CompareAndSwapInt32(&tunnel.handledSSHKeepAliveFailure, 0, 1) 1808 1809 if continuousNetworkConnectivity && 1810 !isShutdown && 1811 !wasHandled { 1812 1813 _ = RecordFailedTunnelStat( 1814 tunnel.config, 1815 tunnel.dialParams, 1816 tunnel.livenessTestMetrics, 1817 bytesUp, 1818 bytesDown, 1819 err) 1820 if tunnel.extraFailureAction != nil { 1821 tunnel.extraFailureAction() 1822 } 1823 1824 // SSHKeepAliveResetOnFailureProbability is set when a late-lifecycle 1825 // impaired protocol attack is suspected. With the given probability, reset 1826 // server affinity and replay parameters for this server to avoid 1827 // continuously reconnecting to the server and/or using the same protocol 1828 // and dial parameters. 1829 1830 if resetOnFailure { 1831 NoticeInfo("Delete dial parameters for %s", tunnel.dialParams.ServerEntry.GetDiagnosticID()) 1832 err := DeleteDialParameters(tunnel.dialParams.ServerEntry.IpAddress, tunnel.dialParams.NetworkID) 1833 if err != nil { 1834 NoticeWarning("DeleteDialParameters failed: %s", err) 1835 } 1836 NoticeInfo("Delete server affinity for %s", tunnel.dialParams.ServerEntry.GetDiagnosticID()) 1837 err = DeleteServerEntryAffinity(tunnel.dialParams.ServerEntry.IpAddress) 1838 if err != nil { 1839 NoticeWarning("DeleteServerEntryAffinity failed: %s", err) 1840 } 1841 } 1842 } 1843 } 1844 1845 return err 1846 } 1847 1848 // sendStats is a helper for sending session stats to the server. 1849 func sendStats(tunnel *Tunnel) bool { 1850 1851 // Tunnel does not have a serverContext when DisableApi is set 1852 if tunnel.serverContext == nil { 1853 return true 1854 } 1855 1856 // Skip when tunnel is discarded 1857 if tunnel.IsDiscarded() { 1858 return true 1859 } 1860 1861 err := tunnel.serverContext.DoStatusRequest(tunnel) 1862 if err != nil { 1863 NoticeWarning("DoStatusRequest failed for %s: %s", 1864 tunnel.dialParams.ServerEntry.GetDiagnosticID(), err) 1865 } 1866 1867 return err == nil 1868 }