github.com/astaguna/popon-core@v0.0.0-20231019235610-96e42d76a5ff/psiphon/tunnel.go (about)

     1  /*
     2   * Copyright (c) 2015, Psiphon Inc.
     3   * All rights reserved.
     4   *
     5   * This program is free software: you can redistribute it and/or modify
     6   * it under the terms of the GNU General Public License as published by
     7   * the Free Software Foundation, either version 3 of the License, or
     8   * (at your option) any later version.
     9   *
    10   * This program is distributed in the hope that it will be useful,
    11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
    12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    13   * GNU General Public License for more details.
    14   *
    15   * You should have received a copy of the GNU General Public License
    16   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
    17   *
    18   */
    19  
    20  package psiphon
    21  
    22  import (
    23  	"bytes"
    24  	"context"
    25  	"crypto/rand"
    26  	"encoding/base64"
    27  	"encoding/json"
    28  	std_errors "errors"
    29  	"fmt"
    30  	"io"
    31  	"io/ioutil"
    32  	"net"
    33  	"net/http"
    34  	"sync"
    35  	"sync/atomic"
    36  	"time"
    37  
    38  	"github.com/astaguna/popon-core/psiphon/common"
    39  	"github.com/astaguna/popon-core/psiphon/common/crypto/ssh"
    40  	"github.com/astaguna/popon-core/psiphon/common/errors"
    41  	"github.com/astaguna/popon-core/psiphon/common/obfuscator"
    42  	"github.com/astaguna/popon-core/psiphon/common/parameters"
    43  	"github.com/astaguna/popon-core/psiphon/common/prng"
    44  	"github.com/astaguna/popon-core/psiphon/common/protocol"
    45  	"github.com/astaguna/popon-core/psiphon/common/quic"
    46  	"github.com/astaguna/popon-core/psiphon/common/refraction"
    47  	"github.com/astaguna/popon-core/psiphon/common/tactics"
    48  	"github.com/astaguna/popon-core/psiphon/transferstats"
    49  )
    50  
    51  // Tunneler specifies the interface required by components that use tunnels.
    52  type Tunneler interface {
    53  
    54  	// Dial creates a tunneled connection.
    55  	//
    56  	// When split tunnel mode is enabled, the connection may be untunneled,
    57  	// depending on GeoIP classification of the destination.
    58  	//
    59  	// downstreamConn is an optional parameter which specifies a connection to be
    60  	// explicitly closed when the Dialed connection is closed. For instance, this
    61  	// is used to close downstreamConn App<->LocalProxy connections when the related
    62  	// LocalProxy<->SshPortForward connections close.
    63  	Dial(remoteAddr string, downstreamConn net.Conn) (conn net.Conn, err error)
    64  
    65  	DirectDial(remoteAddr string) (conn net.Conn, err error)
    66  
    67  	SignalComponentFailure()
    68  }
    69  
    70  // TunnelOwner specifies the interface required by Tunnel to notify its
    71  // owner when it has failed. The owner may, as in the case of the Controller,
    72  // remove the tunnel from its list of active tunnels.
    73  type TunnelOwner interface {
    74  	SignalSeededNewSLOK()
    75  	SignalTunnelFailure(tunnel *Tunnel)
    76  }
    77  
    78  // Tunnel is a connection to a Psiphon server. An established
    79  // tunnel includes a network connection to the specified server
    80  // and an SSH session built on top of that transport.
    81  type Tunnel struct {
    82  	mutex                          *sync.Mutex
    83  	config                         *Config
    84  	isActivated                    bool
    85  	isDiscarded                    bool
    86  	isClosed                       bool
    87  	dialParams                     *DialParameters
    88  	livenessTestMetrics            *livenessTestMetrics
    89  	extraFailureAction             func()
    90  	serverContext                  *ServerContext
    91  	monitoringStartTime            time.Time
    92  	conn                           *common.BurstMonitoredConn
    93  	sshClient                      *ssh.Client
    94  	sshServerRequests              <-chan *ssh.Request
    95  	operateWaitGroup               *sync.WaitGroup
    96  	operateCtx                     context.Context
    97  	stopOperate                    context.CancelFunc
    98  	signalPortForwardFailure       chan struct{}
    99  	totalPortForwardFailures       int
   100  	adjustedEstablishStartTime     time.Time
   101  	establishDuration              time.Duration
   102  	establishedTime                time.Time
   103  	handledSSHKeepAliveFailure     int32
   104  	inFlightConnectedRequestSignal chan struct{}
   105  }
   106  
   107  // getCustomParameters helpers wrap the verbose function call chain required
   108  // to get a current snapshot of the parameters.Parameters customized with the
   109  // dial parameters associated with a tunnel.
   110  
   111  func (tunnel *Tunnel) getCustomParameters() parameters.ParametersAccessor {
   112  	return getCustomParameters(tunnel.config, tunnel.dialParams)
   113  }
   114  
   115  func getCustomParameters(
   116  	config *Config, dialParams *DialParameters) parameters.ParametersAccessor {
   117  	return config.GetParameters().GetCustom(dialParams.NetworkLatencyMultiplier)
   118  }
   119  
   120  // ConnectTunnel first makes a network transport connection to the
   121  // Psiphon server and then establishes an SSH client session on top of
   122  // that transport. The SSH server is authenticated using the public
   123  // key in the server entry.
   124  // Depending on the server's capabilities, the connection may use
   125  // plain SSH over TCP, obfuscated SSH over TCP, or obfuscated SSH over
   126  // HTTP (meek protocol).
   127  // When requiredProtocol is not blank, that protocol is used. Otherwise,
   128  // the a random supported protocol is used.
   129  //
   130  // Call Activate on a connected tunnel to complete its establishment
   131  // before using.
   132  //
   133  // Tunnel establishment is split into two phases: connection, and
   134  // activation. The Controller will run many ConnectTunnel calls
   135  // concurrently and then, to avoid unnecessary overhead from making
   136  // handshake requests and starting operateTunnel from tunnels which
   137  // may be discarded, call Activate on connected tunnels sequentially
   138  // as necessary.
   139  func ConnectTunnel(
   140  	ctx context.Context,
   141  	config *Config,
   142  	adjustedEstablishStartTime time.Time,
   143  	dialParams *DialParameters) (*Tunnel, error) {
   144  
   145  	// Build transport layers and establish SSH connection. Note that
   146  	// dialConn and monitoredConn are the same network connection.
   147  	dialResult, err := dialTunnel(ctx, config, dialParams)
   148  	if err != nil {
   149  		return nil, errors.Trace(err)
   150  	}
   151  
   152  	// The tunnel is now connected
   153  	return &Tunnel{
   154  		mutex:               new(sync.Mutex),
   155  		config:              config,
   156  		dialParams:          dialParams,
   157  		livenessTestMetrics: dialResult.livenessTestMetrics,
   158  		extraFailureAction:  dialResult.extraFailureAction,
   159  		monitoringStartTime: dialResult.monitoringStartTime,
   160  		conn:                dialResult.monitoredConn,
   161  		sshClient:           dialResult.sshClient,
   162  		sshServerRequests:   dialResult.sshRequests,
   163  		// A buffer allows at least one signal to be sent even when the receiver is
   164  		// not listening. Senders should not block.
   165  		signalPortForwardFailure:   make(chan struct{}, 1),
   166  		adjustedEstablishStartTime: adjustedEstablishStartTime,
   167  	}, nil
   168  }
   169  
   170  // Activate completes the tunnel establishment, performing the handshake
   171  // request and starting operateTunnel, the worker that monitors the tunnel
   172  // and handles periodic management.
   173  func (tunnel *Tunnel) Activate(
   174  	ctx context.Context, tunnelOwner TunnelOwner) (retErr error) {
   175  
   176  	// Ensure that, unless the base context is cancelled, any replayed dial
   177  	// parameters are cleared, no longer to be retried, if the tunnel fails to
   178  	// activate.
   179  	activationSucceeded := false
   180  	baseCtx := ctx
   181  	defer func() {
   182  		if !activationSucceeded && baseCtx.Err() != context.Canceled {
   183  			tunnel.dialParams.Failed(tunnel.config)
   184  			if tunnel.extraFailureAction != nil {
   185  				tunnel.extraFailureAction()
   186  			}
   187  			if retErr != nil {
   188  				_ = RecordFailedTunnelStat(
   189  					tunnel.config,
   190  					tunnel.dialParams,
   191  					tunnel.livenessTestMetrics,
   192  					-1,
   193  					-1,
   194  					retErr)
   195  			}
   196  		}
   197  	}()
   198  
   199  	// Create a new Psiphon API server context for this tunnel. This includes
   200  	// performing a handshake request. If the handshake fails, this activation
   201  	// fails.
   202  	var serverContext *ServerContext
   203  	if !tunnel.config.DisableApi {
   204  		NoticeInfo(
   205  			"starting server context for %s",
   206  			tunnel.dialParams.ServerEntry.GetDiagnosticID())
   207  
   208  		// Call NewServerContext in a goroutine, as it blocks on a network operation,
   209  		// the handshake request, and would block shutdown. If the shutdown signal is
   210  		// received, close the tunnel, which will interrupt the handshake request
   211  		// that may be blocking NewServerContext.
   212  		//
   213  		// Timeout after PsiphonApiServerTimeoutSeconds. NewServerContext may not
   214  		// return if the tunnel network connection is unstable during the handshake
   215  		// request. At this point, there is no operateTunnel monitor that will detect
   216  		// this condition with SSH keep alives.
   217  
   218  		timeout := tunnel.getCustomParameters().Duration(
   219  			parameters.PsiphonAPIRequestTimeout)
   220  
   221  		if timeout > 0 {
   222  			var cancelFunc context.CancelFunc
   223  			ctx, cancelFunc = context.WithTimeout(ctx, timeout)
   224  			defer cancelFunc()
   225  		}
   226  
   227  		type newServerContextResult struct {
   228  			serverContext *ServerContext
   229  			err           error
   230  		}
   231  
   232  		resultChannel := make(chan newServerContextResult)
   233  
   234  		go func() {
   235  			serverContext, err := NewServerContext(tunnel)
   236  			resultChannel <- newServerContextResult{
   237  				serverContext: serverContext,
   238  				err:           err,
   239  			}
   240  		}()
   241  
   242  		var result newServerContextResult
   243  
   244  		select {
   245  		case result = <-resultChannel:
   246  		case <-ctx.Done():
   247  			result.err = ctx.Err()
   248  			// Interrupt the goroutine
   249  			tunnel.Close(true)
   250  			<-resultChannel
   251  		}
   252  
   253  		if result.err != nil {
   254  			return errors.Trace(result.err)
   255  		}
   256  
   257  		serverContext = result.serverContext
   258  	}
   259  
   260  	// The activation succeeded.
   261  	activationSucceeded = true
   262  
   263  	tunnel.mutex.Lock()
   264  
   265  	// It may happen that the tunnel gets closed while Activate is running.
   266  	// In this case, abort here, to ensure that the operateTunnel goroutine
   267  	// will not be launched after Close is called.
   268  	if tunnel.isClosed {
   269  		return errors.TraceNew("tunnel is closed")
   270  	}
   271  
   272  	tunnel.isActivated = true
   273  	tunnel.serverContext = serverContext
   274  
   275  	// establishDuration is the elapsed time between the controller starting tunnel
   276  	// establishment and this tunnel being established. The reported value represents
   277  	// how long the user waited between starting the client and having a usable tunnel;
   278  	// or how long between the client detecting an unexpected tunnel disconnect and
   279  	// completing automatic reestablishment.
   280  	//
   281  	// This time period may include time spent unsuccessfully connecting to other
   282  	// servers. Time spent waiting for network connectivity is excluded.
   283  	tunnel.establishDuration = time.Since(tunnel.adjustedEstablishStartTime)
   284  	tunnel.establishedTime = time.Now()
   285  
   286  	// Use the Background context instead of the controller run context, as tunnels
   287  	// are terminated when the controller calls tunnel.Close.
   288  	tunnel.operateCtx, tunnel.stopOperate = context.WithCancel(context.Background())
   289  	tunnel.operateWaitGroup = new(sync.WaitGroup)
   290  
   291  	// Spawn the operateTunnel goroutine, which monitors the tunnel and handles periodic
   292  	// stats updates.
   293  	tunnel.operateWaitGroup.Add(1)
   294  	go tunnel.operateTunnel(tunnelOwner)
   295  
   296  	tunnel.mutex.Unlock()
   297  
   298  	return nil
   299  }
   300  
   301  // Close stops operating the tunnel and closes the underlying connection.
   302  // Supports multiple and/or concurrent calls to Close().
   303  // When isDiscarded is set, operateTunnel will not attempt to send final
   304  // status requests.
   305  func (tunnel *Tunnel) Close(isDiscarded bool) {
   306  
   307  	tunnel.mutex.Lock()
   308  	tunnel.isDiscarded = isDiscarded
   309  	isActivated := tunnel.isActivated
   310  	isClosed := tunnel.isClosed
   311  	tunnel.isClosed = true
   312  	tunnel.mutex.Unlock()
   313  
   314  	if !isClosed {
   315  
   316  		// Signal operateTunnel to stop before closing the tunnel -- this
   317  		// allows a final status request to be made in the case of an orderly
   318  		// shutdown.
   319  		// A timer is set, so if operateTunnel takes too long to stop, the
   320  		// tunnel is closed, which will interrupt any slow final status request.
   321  
   322  		if isActivated {
   323  			timeout := tunnel.getCustomParameters().Duration(
   324  				parameters.TunnelOperateShutdownTimeout)
   325  			afterFunc := time.AfterFunc(
   326  				timeout,
   327  				func() { tunnel.conn.Close() })
   328  			tunnel.stopOperate()
   329  			tunnel.operateWaitGroup.Wait()
   330  			afterFunc.Stop()
   331  		}
   332  
   333  		tunnel.sshClient.Close()
   334  		// tunnel.conn.Close() may get called multiple times, which is allowed.
   335  		tunnel.conn.Close()
   336  
   337  		err := tunnel.sshClient.Wait()
   338  		if err != nil {
   339  			NoticeWarning("close tunnel ssh error: %s", err)
   340  		}
   341  	}
   342  
   343  	// Log burst metrics now that the BurstMonitoredConn is closed.
   344  	// Metrics will be empty when burst monitoring is disabled.
   345  	if !isDiscarded && isActivated {
   346  		burstMetrics := tunnel.conn.GetMetrics(tunnel.monitoringStartTime)
   347  		if len(burstMetrics) > 0 {
   348  			NoticeBursts(
   349  				tunnel.dialParams.ServerEntry.GetDiagnosticID(),
   350  				burstMetrics)
   351  		}
   352  	}
   353  }
   354  
   355  // SetInFlightConnectedRequest checks if a connected request can begin and
   356  // sets the channel used to signal that the request is complete.
   357  //
   358  // The caller must not initiate a connected request when
   359  // SetInFlightConnectedRequest returns false. When SetInFlightConnectedRequest
   360  // returns true, the caller must call SetInFlightConnectedRequest(nil) when
   361  // the connected request completes.
   362  func (tunnel *Tunnel) SetInFlightConnectedRequest(requestSignal chan struct{}) bool {
   363  	tunnel.mutex.Lock()
   364  	defer tunnel.mutex.Unlock()
   365  
   366  	// If already closing, don't start a connected request: the
   367  	// TunnelOperateShutdownTimeout period may be nearly expired.
   368  	if tunnel.isClosed {
   369  		return false
   370  	}
   371  
   372  	if requestSignal == nil {
   373  		// Not already in-flight (not expected)
   374  		if tunnel.inFlightConnectedRequestSignal == nil {
   375  			return false
   376  		}
   377  	} else {
   378  		// Already in-flight (not expected)
   379  		if tunnel.inFlightConnectedRequestSignal != nil {
   380  			return false
   381  		}
   382  	}
   383  
   384  	tunnel.inFlightConnectedRequestSignal = requestSignal
   385  
   386  	return true
   387  }
   388  
   389  // AwaitInFlightConnectedRequest waits for the signal that any in-flight
   390  // connected request is complete.
   391  //
   392  // AwaitInFlightConnectedRequest may block until the connected request is
   393  // aborted by terminating the tunnel.
   394  func (tunnel *Tunnel) AwaitInFlightConnectedRequest() {
   395  	tunnel.mutex.Lock()
   396  	requestSignal := tunnel.inFlightConnectedRequestSignal
   397  	tunnel.mutex.Unlock()
   398  
   399  	if requestSignal != nil {
   400  		<-requestSignal
   401  	}
   402  }
   403  
   404  // IsActivated returns the tunnel's activated flag.
   405  func (tunnel *Tunnel) IsActivated() bool {
   406  	tunnel.mutex.Lock()
   407  	defer tunnel.mutex.Unlock()
   408  	return tunnel.isActivated
   409  }
   410  
   411  // IsDiscarded returns the tunnel's discarded flag.
   412  func (tunnel *Tunnel) IsDiscarded() bool {
   413  	tunnel.mutex.Lock()
   414  	defer tunnel.mutex.Unlock()
   415  	return tunnel.isDiscarded
   416  }
   417  
   418  // SendAPIRequest sends an API request as an SSH request through the tunnel.
   419  // This function blocks awaiting a response. Only one request may be in-flight
   420  // at once; a concurrent SendAPIRequest will block until an active request
   421  // receives its response (or the SSH connection is terminated).
   422  func (tunnel *Tunnel) SendAPIRequest(
   423  	name string, requestPayload []byte) ([]byte, error) {
   424  
   425  	ok, responsePayload, err := tunnel.sshClient.Conn.SendRequest(
   426  		name, true, requestPayload)
   427  
   428  	if err != nil {
   429  		return nil, errors.Trace(err)
   430  	}
   431  
   432  	if !ok {
   433  		return nil, errors.TraceNew("API request rejected")
   434  	}
   435  
   436  	return responsePayload, nil
   437  }
   438  
   439  // DialTCPChannel establishes a TCP port forward connection through the
   440  // tunnel.
   441  //
   442  // When split tunnel mode is enabled, and unless alwaysTunneled is set, the
   443  // server may reject the port forward and indicate that the client is to make
   444  // direct, untunneled connection. In this case, the bool return value is true
   445  // and net.Conn and error are nil.
   446  //
   447  // downstreamConn is an optional parameter which specifies a connection to be
   448  // explicitly closed when the dialed connection is closed.
   449  func (tunnel *Tunnel) DialTCPChannel(
   450  	remoteAddr string,
   451  	alwaysTunneled bool,
   452  	downstreamConn net.Conn) (net.Conn, bool, error) {
   453  
   454  	channelType := "direct-tcpip"
   455  	if alwaysTunneled && tunnel.config.IsSplitTunnelEnabled() {
   456  		// This channel type is only necessary in split tunnel mode.
   457  		channelType = protocol.TCP_PORT_FORWARD_NO_SPLIT_TUNNEL_TYPE
   458  	}
   459  
   460  	channel, err := tunnel.dialChannel(channelType, remoteAddr)
   461  	if err != nil {
   462  		if isSplitTunnelRejectReason(err) {
   463  			return nil, true, nil
   464  		}
   465  		return nil, false, errors.Trace(err)
   466  	}
   467  
   468  	netConn, ok := channel.(net.Conn)
   469  	if !ok {
   470  		return nil, false, errors.Tracef("unexpected channel type: %T", channel)
   471  	}
   472  
   473  	conn := &TunneledConn{
   474  		Conn:           netConn,
   475  		tunnel:         tunnel,
   476  		downstreamConn: downstreamConn}
   477  
   478  	return tunnel.wrapWithTransferStats(conn), false, nil
   479  }
   480  
   481  func (tunnel *Tunnel) DialPacketTunnelChannel() (net.Conn, error) {
   482  
   483  	channel, err := tunnel.dialChannel(protocol.PACKET_TUNNEL_CHANNEL_TYPE, "")
   484  	if err != nil {
   485  		return nil, errors.Trace(err)
   486  	}
   487  
   488  	sshChannel, ok := channel.(ssh.Channel)
   489  	if !ok {
   490  		return nil, errors.Tracef("unexpected channel type: %T", channel)
   491  	}
   492  
   493  	NoticeInfo("DialPacketTunnelChannel: established channel")
   494  
   495  	conn := newChannelConn(sshChannel)
   496  
   497  	// wrapWithTransferStats will track bytes transferred for the
   498  	// packet tunnel. It will count packet overhead (TCP/UDP/IP headers).
   499  	//
   500  	// Since the data in the channel is not HTTP or TLS, no domain bytes
   501  	// counting is expected.
   502  	//
   503  	// transferstats are also used to determine that there's been recent
   504  	// activity and skip periodic SSH keep alives; see Tunnel.operateTunnel.
   505  
   506  	return tunnel.wrapWithTransferStats(conn), nil
   507  }
   508  
   509  func (tunnel *Tunnel) dialChannel(channelType, remoteAddr string) (interface{}, error) {
   510  
   511  	if !tunnel.IsActivated() {
   512  		return nil, errors.TraceNew("tunnel is not activated")
   513  	}
   514  
   515  	// Note: there is no dial context since SSH port forward dials cannot
   516  	// be interrupted directly. Closing the tunnel will interrupt the dials.
   517  	// A timeout is set to unblock this function, but the goroutine may
   518  	// not exit until the tunnel is closed.
   519  
   520  	type channelDialResult struct {
   521  		channel interface{}
   522  		err     error
   523  	}
   524  
   525  	// Use a buffer of 1 as there are two senders and only one guaranteed receive.
   526  
   527  	results := make(chan *channelDialResult, 1)
   528  
   529  	afterFunc := time.AfterFunc(
   530  		tunnel.getCustomParameters().Duration(
   531  			parameters.TunnelPortForwardDialTimeout),
   532  		func() {
   533  			results <- &channelDialResult{
   534  				nil, errors.Tracef("channel dial timeout: %s", channelType)}
   535  		})
   536  	defer afterFunc.Stop()
   537  
   538  	go func() {
   539  		result := new(channelDialResult)
   540  		switch channelType {
   541  
   542  		case "direct-tcpip", protocol.TCP_PORT_FORWARD_NO_SPLIT_TUNNEL_TYPE:
   543  			// The protocol.TCP_PORT_FORWARD_NO_SPLIT_TUNNEL_TYPE is the same as
   544  			// "direct-tcpip", except split tunnel channel rejections are disallowed
   545  			// even when split tunnel mode is enabled.
   546  			result.channel, result.err =
   547  				tunnel.sshClient.Dial(channelType, remoteAddr)
   548  
   549  		default:
   550  			var sshRequests <-chan *ssh.Request
   551  			result.channel, sshRequests, result.err =
   552  				tunnel.sshClient.OpenChannel(channelType, nil)
   553  			if result.err == nil {
   554  				go ssh.DiscardRequests(sshRequests)
   555  			}
   556  		}
   557  		if result.err != nil {
   558  			result.err = errors.Trace(result.err)
   559  		}
   560  		results <- result
   561  	}()
   562  
   563  	result := <-results
   564  
   565  	if result.err != nil {
   566  		if !isSplitTunnelRejectReason(result.err) {
   567  			select {
   568  			case tunnel.signalPortForwardFailure <- struct{}{}:
   569  			default:
   570  			}
   571  		}
   572  		return nil, errors.Trace(result.err)
   573  	}
   574  
   575  	return result.channel, nil
   576  }
   577  
   578  func isSplitTunnelRejectReason(err error) bool {
   579  
   580  	var openChannelErr *ssh.OpenChannelError
   581  	if std_errors.As(err, &openChannelErr) {
   582  		return openChannelErr.Reason ==
   583  			ssh.RejectionReason(protocol.CHANNEL_REJECT_REASON_SPLIT_TUNNEL)
   584  	}
   585  
   586  	return false
   587  }
   588  
   589  func (tunnel *Tunnel) wrapWithTransferStats(conn net.Conn) net.Conn {
   590  
   591  	// Tunnel does not have a serverContext when DisableApi is set. We still use
   592  	// transferstats.Conn to count bytes transferred for monitoring tunnel
   593  	// quality.
   594  	var regexps *transferstats.Regexps
   595  	if tunnel.serverContext != nil {
   596  		regexps = tunnel.serverContext.StatsRegexps()
   597  	}
   598  
   599  	return transferstats.NewConn(
   600  		conn, tunnel.dialParams.ServerEntry.IpAddress, regexps)
   601  }
   602  
   603  // SignalComponentFailure notifies the tunnel that an associated component has failed.
   604  // This will terminate the tunnel.
   605  func (tunnel *Tunnel) SignalComponentFailure() {
   606  	NoticeWarning("tunnel received component failure signal")
   607  	tunnel.Close(false)
   608  }
   609  
   610  // TunneledConn implements net.Conn and wraps a port forward connection.
   611  // It is used to hook into Read and Write to observe I/O errors and
   612  // report these errors back to the tunnel monitor as port forward failures.
   613  // TunneledConn optionally tracks a peer connection to be explicitly closed
   614  // when the TunneledConn is closed.
   615  type TunneledConn struct {
   616  	net.Conn
   617  	tunnel         *Tunnel
   618  	downstreamConn net.Conn
   619  }
   620  
   621  func (conn *TunneledConn) Read(buffer []byte) (n int, err error) {
   622  	n, err = conn.Conn.Read(buffer)
   623  	if err != nil && err != io.EOF {
   624  		// Report new failure. Won't block; assumes the receiver
   625  		// has a sufficient buffer for the threshold number of reports.
   626  		// TODO: conditional on type of error or error message?
   627  		select {
   628  		case conn.tunnel.signalPortForwardFailure <- struct{}{}:
   629  		default:
   630  		}
   631  	}
   632  	return
   633  }
   634  
   635  func (conn *TunneledConn) Write(buffer []byte) (n int, err error) {
   636  	n, err = conn.Conn.Write(buffer)
   637  	if err != nil && err != io.EOF {
   638  		// Same as TunneledConn.Read()
   639  		select {
   640  		case conn.tunnel.signalPortForwardFailure <- struct{}{}:
   641  		default:
   642  		}
   643  	}
   644  	return
   645  }
   646  
   647  func (conn *TunneledConn) Close() error {
   648  	if conn.downstreamConn != nil {
   649  		conn.downstreamConn.Close()
   650  	}
   651  	return conn.Conn.Close()
   652  }
   653  
   654  type dialResult struct {
   655  	dialConn            net.Conn
   656  	monitoringStartTime time.Time
   657  	monitoredConn       *common.BurstMonitoredConn
   658  	sshClient           *ssh.Client
   659  	sshRequests         <-chan *ssh.Request
   660  	livenessTestMetrics *livenessTestMetrics
   661  	extraFailureAction  func()
   662  }
   663  
   664  // dialTunnel is a helper that builds the transport layers and establishes the
   665  // SSH connection. When additional dial configuration is used, dial metrics
   666  // are recorded and returned.
   667  func dialTunnel(
   668  	ctx context.Context,
   669  	config *Config,
   670  	dialParams *DialParameters) (_ *dialResult, retErr error) {
   671  
   672  	// Return immediately when overall context is canceled or timed-out. This
   673  	// avoids notice noise.
   674  	err := ctx.Err()
   675  	if err != nil {
   676  		return nil, errors.Trace(err)
   677  	}
   678  
   679  	p := getCustomParameters(config, dialParams)
   680  	timeout := p.Duration(parameters.TunnelConnectTimeout)
   681  	rateLimits := p.RateLimits(parameters.TunnelRateLimits)
   682  	obfuscatedSSHMinPadding := p.Int(parameters.ObfuscatedSSHMinPadding)
   683  	obfuscatedSSHMaxPadding := p.Int(parameters.ObfuscatedSSHMaxPadding)
   684  	livenessTestMinUpstreamBytes := p.Int(parameters.LivenessTestMinUpstreamBytes)
   685  	livenessTestMaxUpstreamBytes := p.Int(parameters.LivenessTestMaxUpstreamBytes)
   686  	livenessTestMinDownstreamBytes := p.Int(parameters.LivenessTestMinDownstreamBytes)
   687  	livenessTestMaxDownstreamBytes := p.Int(parameters.LivenessTestMaxDownstreamBytes)
   688  	burstUpstreamTargetBytes := int64(p.Int(parameters.ClientBurstUpstreamTargetBytes))
   689  	burstUpstreamDeadline := p.Duration(parameters.ClientBurstUpstreamDeadline)
   690  	burstDownstreamTargetBytes := int64(p.Int(parameters.ClientBurstDownstreamTargetBytes))
   691  	burstDownstreamDeadline := p.Duration(parameters.ClientBurstDownstreamDeadline)
   692  	tlsOSSHApplyTrafficShaping := p.WeightedCoinFlip(parameters.TLSTunnelTrafficShapingProbability)
   693  	tlsOSSHMinTLSPadding := p.Int(parameters.TLSTunnelMinTLSPadding)
   694  	tlsOSSHMaxTLSPadding := p.Int(parameters.TLSTunnelMaxTLSPadding)
   695  	p.Close()
   696  
   697  	// Ensure that, unless the base context is cancelled, any replayed dial
   698  	// parameters are cleared, no longer to be retried, if the tunnel fails to
   699  	// connect.
   700  	//
   701  	// Limitation: dials that fail to connect due to the server being in a
   702  	// load-limiting state are not distinguished and excepted from this
   703  	// logic.
   704  	dialSucceeded := false
   705  	baseCtx := ctx
   706  	var failedTunnelLivenessTestMetrics *livenessTestMetrics
   707  	var extraFailureAction func()
   708  	defer func() {
   709  		if !dialSucceeded && baseCtx.Err() != context.Canceled {
   710  			dialParams.Failed(config)
   711  			if extraFailureAction != nil {
   712  				extraFailureAction()
   713  			}
   714  			if retErr != nil {
   715  				_ = RecordFailedTunnelStat(
   716  					config,
   717  					dialParams,
   718  					failedTunnelLivenessTestMetrics,
   719  					-1,
   720  					-1,
   721  					retErr)
   722  			}
   723  		}
   724  	}()
   725  
   726  	var cancelFunc context.CancelFunc
   727  	ctx, cancelFunc = context.WithTimeout(ctx, timeout)
   728  	defer cancelFunc()
   729  
   730  	// DialDuration is the elapsed time for both successful and failed tunnel
   731  	// dials. For successful tunnels, it includes any the network protocol
   732  	// handshake(s), obfuscation protocol handshake(s), SSH handshake, and
   733  	// liveness test, when performed.
   734  	//
   735  	// Note: ensure DialDuration is set before calling any function which logs
   736  	// dial_duration.
   737  
   738  	startDialTime := time.Now()
   739  	defer func() {
   740  		dialParams.DialDuration = time.Since(startDialTime)
   741  	}()
   742  
   743  	// Note: dialParams.MeekResolvedIPAddress isn't set until the dial begins,
   744  	// so it will always be blank in NoticeConnectingServer.
   745  
   746  	NoticeConnectingServer(dialParams)
   747  
   748  	// Create the base transport: meek or direct connection
   749  
   750  	var dialConn net.Conn
   751  
   752  	if protocol.TunnelProtocolUsesTLSOSSH(dialParams.TunnelProtocol) {
   753  
   754  		dialConn, err = DialTLSTunnel(
   755  			ctx,
   756  			dialParams.GetTLSOSSHConfig(config),
   757  			dialParams.GetDialConfig(),
   758  			tlsOSSHApplyTrafficShaping,
   759  			tlsOSSHMinTLSPadding,
   760  			tlsOSSHMaxTLSPadding)
   761  		if err != nil {
   762  			return nil, errors.Trace(err)
   763  		}
   764  
   765  	} else if protocol.TunnelProtocolUsesMeek(dialParams.TunnelProtocol) {
   766  
   767  		dialConn, err = DialMeek(
   768  			ctx,
   769  			dialParams.GetMeekConfig(),
   770  			dialParams.GetDialConfig())
   771  		if err != nil {
   772  			return nil, errors.Trace(err)
   773  		}
   774  
   775  	} else if protocol.TunnelProtocolUsesQUIC(dialParams.TunnelProtocol) {
   776  
   777  		packetConn, remoteAddr, err := NewUDPConn(
   778  			ctx,
   779  			dialParams.DirectDialAddress,
   780  			dialParams.GetDialConfig())
   781  		if err != nil {
   782  			return nil, errors.Trace(err)
   783  		}
   784  
   785  		dialConn, err = quic.Dial(
   786  			ctx,
   787  			packetConn,
   788  			remoteAddr,
   789  			dialParams.QUICDialSNIAddress,
   790  			dialParams.QUICVersion,
   791  			dialParams.QUICClientHelloSeed,
   792  			dialParams.ServerEntry.SshObfuscatedKey,
   793  			dialParams.ObfuscatedQUICPaddingSeed,
   794  			dialParams.ObfuscatedQUICNonceTransformerParameters,
   795  			dialParams.QUICDisablePathMTUDiscovery)
   796  		if err != nil {
   797  			return nil, errors.Trace(err)
   798  		}
   799  
   800  	} else if protocol.TunnelProtocolUsesTapDance(dialParams.TunnelProtocol) {
   801  
   802  		dialConn, err = refraction.DialTapDance(
   803  			ctx,
   804  			config.EmitRefractionNetworkingLogs,
   805  			config.GetPsiphonDataDirectory(),
   806  			NewNetDialer(dialParams.GetDialConfig()),
   807  			dialParams.DirectDialAddress)
   808  		if err != nil {
   809  			return nil, errors.Trace(err)
   810  		}
   811  
   812  	} else if protocol.TunnelProtocolUsesConjure(dialParams.TunnelProtocol) {
   813  
   814  		// Specify a cache key with a scope that ensures that:
   815  		//
   816  		// (a) cached registrations aren't used across different networks, as a
   817  		// registration requires the client's public IP to match the value at time
   818  		// of registration;
   819  		//
   820  		// (b) cached registrations are associated with specific Psiphon server
   821  		// candidates, to ensure that replay will use the same phantom IP(s).
   822  		//
   823  		// This scheme allows for reuse of cached registrations on network A when a
   824  		// client roams from network A to network B and back to network A.
   825  		//
   826  		// Using the network ID as a proxy for client public IP address is a
   827  		// heurisitic: it's possible that a clients public IP address changes
   828  		// without the network ID changing, and it's not guaranteed that the client
   829  		// will be assigned the original public IP on network A; so there's some
   830  		// chance the registration cannot be reused.
   831  
   832  		diagnosticID := dialParams.ServerEntry.GetDiagnosticID()
   833  
   834  		cacheKey := dialParams.NetworkID + "-" + diagnosticID
   835  
   836  		conjureConfig := &refraction.ConjureConfig{
   837  			RegistrationCacheTTL: dialParams.ConjureCachedRegistrationTTL,
   838  			RegistrationCacheKey: cacheKey,
   839  			Transport:            dialParams.ConjureTransport,
   840  			DiagnosticID:         diagnosticID,
   841  			Logger:               NoticeCommonLogger(),
   842  		}
   843  
   844  		// Set extraFailureAction, which is invoked whenever the tunnel fails (i.e.,
   845  		// where RecordFailedTunnelStat is invoked). The action will remove any
   846  		// cached registration. When refraction.DialConjure succeeds, the underlying
   847  		// registration is cached. After refraction.DialConjure returns, it no
   848  		// longer modifies the cached state of that registration, assuming that it
   849  		// remains valid and effective. However adversarial impact on a given
   850  		// phantom IP may not become evident until after the initial TCP connection
   851  		// establishment and handshake performed by refraction.DialConjure. For
   852  		// example, it may be that the phantom dial is targeted for severe
   853  		// throttling which begins or is only evident later in the flow. Scheduling
   854  		// a call to DeleteCachedConjureRegistration allows us to invalidate the
   855  		// cached registration for a tunnel that fails later in its lifecycle.
   856  		//
   857  		// Note that extraFailureAction will retain a reference to conjureConfig for
   858  		// the lifetime of the tunnel.
   859  		extraFailureAction = func() {
   860  			refraction.DeleteCachedConjureRegistration(conjureConfig)
   861  		}
   862  
   863  		if dialParams.ConjureAPIRegistration {
   864  
   865  			// Use MeekConn to domain front Conjure API registration.
   866  			//
   867  			// ConjureAPIRegistrarFrontingSpecs are applied via
   868  			// dialParams.GetMeekConfig, and will be subject to replay.
   869  			//
   870  			// Since DialMeek will create a TLS connection immediately, and a cached
   871  			// registration may be used, we will delay initializing the MeekConn-based
   872  			// RoundTripper until we know it's needed. This is implemented by passing
   873  			// in a RoundTripper that establishes a MeekConn when RoundTrip is called.
   874  			//
   875  			// In refraction.dial we configure 0 retries for API registration requests,
   876  			// assuming it's better to let another Psiphon candidate retry, with new
   877  			// domaing fronting parameters. As such, we expect only one round trip call
   878  			// per NewHTTPRoundTripper, so, in practise, there's no performance penalty
   879  			// from establishing a new MeekConn per round trip.
   880  			//
   881  			// Performing the full DialMeek/RoundTrip operation here allows us to call
   882  			// MeekConn.Close and ensure all resources are immediately cleaned up.
   883  			roundTrip := func(request *http.Request) (*http.Response, error) {
   884  
   885  				conn, err := DialMeek(
   886  					ctx, dialParams.GetMeekConfig(), dialParams.GetDialConfig())
   887  				if err != nil {
   888  					return nil, errors.Trace(err)
   889  				}
   890  				defer conn.Close()
   891  
   892  				response, err := conn.RoundTrip(request)
   893  				if err != nil {
   894  					return nil, errors.Trace(err)
   895  				}
   896  
   897  				// Read the response into a buffer and close the response
   898  				// body, ensuring that MeekConn.Close closes all idle connections.
   899  				//
   900  				// Alternatively, we could Clone the request to set
   901  				// http.Request.Close and avoid keeping any idle connection
   902  				// open after the response body is read by gotapdance. Since
   903  				// the response body is small and since gotapdance does not
   904  				// stream the response body, we're taking this approach which
   905  				// ensures cleanup.
   906  
   907  				body, err := ioutil.ReadAll(response.Body)
   908  				_ = response.Body.Close()
   909  				if err != nil {
   910  					return nil, errors.Trace(err)
   911  				}
   912  				response.Body = io.NopCloser(bytes.NewReader(body))
   913  
   914  				return response, nil
   915  			}
   916  
   917  			conjureConfig.APIRegistrarHTTPClient = &http.Client{
   918  				Transport: common.NewHTTPRoundTripper(roundTrip),
   919  			}
   920  
   921  			conjureConfig.APIRegistrarBidirectionalURL =
   922  				dialParams.ConjureAPIRegistrarBidirectionalURL
   923  			conjureConfig.APIRegistrarDelay = dialParams.ConjureAPIRegistrarDelay
   924  
   925  		} else if dialParams.ConjureDecoyRegistration {
   926  
   927  			// The Conjure "phantom" connection is compatible with fragmentation, but
   928  			// the decoy registrar connection, like Tapdance, is not, so force it off.
   929  			// Any tunnel fragmentation metrics will refer to the "phantom" connection
   930  			// only.
   931  			conjureConfig.DecoyRegistrarDialer = NewNetDialer(
   932  				dialParams.GetDialConfig().WithoutFragmentor())
   933  			conjureConfig.DecoyRegistrarWidth = dialParams.ConjureDecoyRegistrarWidth
   934  			conjureConfig.DecoyRegistrarDelay = dialParams.ConjureDecoyRegistrarDelay
   935  		}
   936  
   937  		dialConn, err = refraction.DialConjure(
   938  			ctx,
   939  			config.EmitRefractionNetworkingLogs,
   940  			config.GetPsiphonDataDirectory(),
   941  			NewNetDialer(dialParams.GetDialConfig()),
   942  			dialParams.DirectDialAddress,
   943  			conjureConfig)
   944  		if err != nil {
   945  			return nil, errors.Trace(err)
   946  		}
   947  
   948  	} else {
   949  
   950  		dialConn, err = DialTCP(
   951  			ctx,
   952  			dialParams.DirectDialAddress,
   953  			dialParams.GetDialConfig())
   954  		if err != nil {
   955  			return nil, errors.Trace(err)
   956  		}
   957  	}
   958  
   959  	// Some conns report additional metrics. fragmentor.Conns report
   960  	// fragmentor configs.
   961  	if metricsSource, ok := dialConn.(common.MetricsSource); ok {
   962  		dialParams.DialConnMetrics = metricsSource
   963  	}
   964  
   965  	if noticeMetricsSource, ok := dialConn.(common.NoticeMetricsSource); ok {
   966  		dialParams.DialConnNoticeMetrics = noticeMetricsSource
   967  	}
   968  
   969  	// If dialConn is not a Closer, tunnel failure detection may be slower
   970  	if _, ok := dialConn.(common.Closer); !ok {
   971  		NoticeWarning("tunnel.dialTunnel: dialConn is not a Closer")
   972  	}
   973  
   974  	cleanupConn := dialConn
   975  	defer func() {
   976  		// Cleanup on error
   977  		if cleanupConn != nil {
   978  			cleanupConn.Close()
   979  		}
   980  	}()
   981  
   982  	monitoringStartTime := time.Now()
   983  	monitoredConn := common.NewBurstMonitoredConn(
   984  		dialConn,
   985  		false,
   986  		burstUpstreamTargetBytes, burstUpstreamDeadline,
   987  		burstDownstreamTargetBytes, burstDownstreamDeadline)
   988  
   989  	// Apply throttling (if configured)
   990  	throttledConn := common.NewThrottledConn(
   991  		monitoredConn,
   992  		rateLimits)
   993  
   994  	// Add obfuscated SSH layer
   995  	var sshConn net.Conn = throttledConn
   996  	if protocol.TunnelProtocolUsesObfuscatedSSH(dialParams.TunnelProtocol) {
   997  		obfuscatedSSHConn, err := obfuscator.NewClientObfuscatedSSHConn(
   998  			throttledConn,
   999  			dialParams.ServerEntry.SshObfuscatedKey,
  1000  			dialParams.ObfuscatorPaddingSeed,
  1001  			dialParams.OSSHObfuscatorSeedTransformerParameters,
  1002  			dialParams.OSSHPrefixSpec,
  1003  			dialParams.OSSHPrefixSplitConfig,
  1004  			&obfuscatedSSHMinPadding,
  1005  			&obfuscatedSSHMaxPadding)
  1006  		if err != nil {
  1007  			return nil, errors.Trace(err)
  1008  		}
  1009  		sshConn = obfuscatedSSHConn
  1010  		dialParams.ObfuscatedSSHConnMetrics = obfuscatedSSHConn
  1011  	}
  1012  
  1013  	// Now establish the SSH session over the conn transport
  1014  	expectedPublicKey, err := base64.StdEncoding.DecodeString(
  1015  		dialParams.ServerEntry.SshHostKey)
  1016  	if err != nil {
  1017  		return nil, errors.Trace(err)
  1018  	}
  1019  	sshCertChecker := &ssh.CertChecker{
  1020  		IsHostAuthority: func(auth ssh.PublicKey, address string) bool {
  1021  			// Psiphon servers do not currently use SSH certificates. This CertChecker
  1022  			// code path may still be hit if a client attempts to connect using an
  1023  			// obsolete server entry.
  1024  			return false
  1025  		},
  1026  		HostKeyFallback: func(addr string, remote net.Addr, publicKey ssh.PublicKey) error {
  1027  			if !bytes.Equal(expectedPublicKey, publicKey.Marshal()) {
  1028  				return errors.TraceNew("unexpected host public key")
  1029  			}
  1030  			return nil
  1031  		},
  1032  	}
  1033  
  1034  	sshPasswordPayload := &protocol.SSHPasswordPayload{
  1035  		SessionId:          config.SessionID,
  1036  		SshPassword:        dialParams.ServerEntry.SshPassword,
  1037  		ClientCapabilities: []string{protocol.CLIENT_CAPABILITY_SERVER_REQUESTS},
  1038  	}
  1039  
  1040  	payload, err := json.Marshal(sshPasswordPayload)
  1041  	if err != nil {
  1042  		return nil, errors.Trace(err)
  1043  	}
  1044  
  1045  	sshClientConfig := &ssh.ClientConfig{
  1046  		User: dialParams.ServerEntry.SshUsername,
  1047  		Auth: []ssh.AuthMethod{
  1048  			ssh.Password(string(payload)),
  1049  		},
  1050  		HostKeyCallback: sshCertChecker.CheckHostKey,
  1051  		ClientVersion:   dialParams.SSHClientVersion,
  1052  	}
  1053  
  1054  	sshClientConfig.KEXPRNGSeed = dialParams.SSHKEXSeed
  1055  
  1056  	if protocol.TunnelProtocolUsesObfuscatedSSH(dialParams.TunnelProtocol) {
  1057  		if config.ObfuscatedSSHAlgorithms != nil {
  1058  			sshClientConfig.KeyExchanges = []string{config.ObfuscatedSSHAlgorithms[0]}
  1059  			sshClientConfig.Ciphers = []string{config.ObfuscatedSSHAlgorithms[1]}
  1060  			sshClientConfig.MACs = []string{config.ObfuscatedSSHAlgorithms[2]}
  1061  			sshClientConfig.HostKeyAlgorithms = []string{config.ObfuscatedSSHAlgorithms[3]}
  1062  
  1063  		} else {
  1064  			// With Encrypt-then-MAC hash algorithms, packet length is
  1065  			// transmitted in plaintext, which aids in traffic analysis.
  1066  			//
  1067  			// TUNNEL_PROTOCOL_SSH is excepted since its KEX appears in plaintext,
  1068  			// and the protocol is intended to look like SSH on the wire.
  1069  			sshClientConfig.NoEncryptThenMACHash = true
  1070  		}
  1071  	} else {
  1072  		// For TUNNEL_PROTOCOL_SSH only, the server is expected to randomize
  1073  		// its KEX; setting PeerKEXPRNGSeed will ensure successful negotiation
  1074  		// betweem two randomized KEXes.
  1075  		if dialParams.ServerEntry.SshObfuscatedKey != "" {
  1076  			sshClientConfig.PeerKEXPRNGSeed, err = protocol.DeriveSSHServerKEXPRNGSeed(
  1077  				dialParams.ServerEntry.SshObfuscatedKey)
  1078  			if err != nil {
  1079  				return nil, errors.Trace(err)
  1080  			}
  1081  		}
  1082  	}
  1083  
  1084  	// The ssh session establishment (via ssh.NewClientConn) is wrapped
  1085  	// in a timeout to ensure it won't hang. We've encountered firewalls
  1086  	// that allow the TCP handshake to complete but then send a RST to the
  1087  	// server-side and nothing to the client-side, and if that happens
  1088  	// while ssh.NewClientConn is reading, it may wait forever. The timeout
  1089  	// closes the conn, which interrupts it.
  1090  	// Note: TCP handshake timeouts are provided by TCPConn, and session
  1091  	// timeouts *after* ssh establishment are provided by the ssh keep alive
  1092  	// in operate tunnel.
  1093  
  1094  	type sshNewClientResult struct {
  1095  		sshClient           *ssh.Client
  1096  		sshRequests         <-chan *ssh.Request
  1097  		livenessTestMetrics *livenessTestMetrics
  1098  		err                 error
  1099  	}
  1100  
  1101  	resultChannel := make(chan sshNewClientResult)
  1102  
  1103  	// Call NewClientConn in a goroutine, as it blocks on SSH handshake network
  1104  	// operations, and would block canceling or shutdown. If the parent context
  1105  	// is canceled, close the net.Conn underlying SSH, which will interrupt the
  1106  	// SSH handshake that may be blocking NewClientConn.
  1107  
  1108  	go func() {
  1109  		// The following is adapted from ssh.Dial(), here using a custom conn
  1110  		// The sshAddress is passed through to host key verification callbacks; we don't use it.
  1111  		sshAddress := ""
  1112  		sshClientConn, sshChannels, sshRequests, err := ssh.NewClientConn(
  1113  			sshConn, sshAddress, sshClientConfig)
  1114  
  1115  		var sshClient *ssh.Client
  1116  		var metrics *livenessTestMetrics
  1117  
  1118  		if err == nil {
  1119  
  1120  			// sshRequests is handled by operateTunnel.
  1121  			// ssh.NewClient also expects to handle the sshRequests
  1122  			// value from ssh.NewClientConn and will spawn a goroutine
  1123  			// to handle the  <-chan *ssh.Request, so we must provide
  1124  			// a closed channel to ensure that goroutine halts instead
  1125  			// of hanging on a nil channel.
  1126  			noRequests := make(chan *ssh.Request)
  1127  			close(noRequests)
  1128  
  1129  			sshClient = ssh.NewClient(sshClientConn, sshChannels, noRequests)
  1130  
  1131  			if livenessTestMaxUpstreamBytes > 0 || livenessTestMaxDownstreamBytes > 0 {
  1132  
  1133  				// When configured, perform a liveness test which sends and
  1134  				// receives bytes through the tunnel to ensure the tunnel had
  1135  				// not been blocked upon or shortly after connecting. This
  1136  				// test is performed concurrently for each establishment
  1137  				// candidate before selecting a successful tunnel.
  1138  				//
  1139  				// Note that the liveness test is subject to the
  1140  				// TunnelConnectTimeout, which should be adjusted
  1141  				// accordinging.
  1142  
  1143  				metrics, err = performLivenessTest(
  1144  					sshClient,
  1145  					livenessTestMinUpstreamBytes, livenessTestMaxUpstreamBytes,
  1146  					livenessTestMinDownstreamBytes, livenessTestMaxDownstreamBytes,
  1147  					dialParams.LivenessTestSeed)
  1148  
  1149  				// Skip notice when cancelling.
  1150  				if baseCtx.Err() == nil {
  1151  					NoticeLivenessTest(
  1152  						dialParams.ServerEntry.GetDiagnosticID(), metrics, err == nil)
  1153  				}
  1154  			}
  1155  		}
  1156  
  1157  		resultChannel <- sshNewClientResult{sshClient, sshRequests, metrics, err}
  1158  	}()
  1159  
  1160  	var result sshNewClientResult
  1161  
  1162  	select {
  1163  	case result = <-resultChannel:
  1164  	case <-ctx.Done():
  1165  
  1166  		// Interrupt the goroutine and capture its error context to
  1167  		// distinguish point of failure.
  1168  		err := ctx.Err()
  1169  		sshConn.Close()
  1170  		result = <-resultChannel
  1171  		if result.err != nil {
  1172  			result.err = fmt.Errorf("%s: %s", err, result.err)
  1173  		} else {
  1174  			result.err = err
  1175  		}
  1176  	}
  1177  
  1178  	if result.err != nil {
  1179  		failedTunnelLivenessTestMetrics = result.livenessTestMetrics
  1180  		return nil, errors.Trace(result.err)
  1181  	}
  1182  
  1183  	dialSucceeded = true
  1184  
  1185  	NoticeConnectedServer(dialParams)
  1186  
  1187  	cleanupConn = nil
  1188  
  1189  	// Invoke DNS cache extension (if enabled in the resolver) now that the
  1190  	// tunnel is connected and the Psiphon server is authenticated. This
  1191  	// demonstrates that any domain name resolved to an endpoint that is or
  1192  	// is forwarded to the expected Psiphon server.
  1193  	//
  1194  	// Limitation: DNS cache extension is not implemented for Refraction
  1195  	// Networking protocols. iOS VPN, the primary use case for DNS cache
  1196  	// extension, does not enable Refraction Networking.
  1197  	if protocol.TunnelProtocolUsesFrontedMeek(dialParams.TunnelProtocol) {
  1198  		resolver := config.GetResolver()
  1199  		if resolver != nil {
  1200  			resolver.VerifyCacheExtension(dialParams.MeekFrontingDialAddress)
  1201  		}
  1202  	}
  1203  
  1204  	// When configured to do so, hold-off on activating this tunnel. This allows
  1205  	// some extra time for slower but less resource intensive protocols to
  1206  	// establish tunnels. By holding off post-connect, the client has this
  1207  	// established tunnel ready to activate in case other protocols fail to
  1208  	// establish. This hold-off phase continues to consume one connection worker.
  1209  	//
  1210  	// The network latency multiplier is not applied to HoldOffTunnelDuration,
  1211  	// as the goal is to apply a consistent hold-off range across all tunnel
  1212  	// candidates; and this avoids scaling up any delay users experience.
  1213  	//
  1214  	// The hold-off is applied regardless of whether this is the first tunnel
  1215  	// in a session or a reconnection, even to a server affinity candidate,
  1216  	// so that the advantage for other protocols persists.
  1217  
  1218  	if dialParams.HoldOffTunnelDuration > 0 {
  1219  		NoticeHoldOffTunnel(dialParams.ServerEntry.GetDiagnosticID(), dialParams.HoldOffTunnelDuration)
  1220  		common.SleepWithContext(ctx, dialParams.HoldOffTunnelDuration)
  1221  	}
  1222  
  1223  	// Note: dialConn may be used to close the underlying network connection
  1224  	// but should not be used to perform I/O as that would interfere with SSH
  1225  	// (and also bypasses throttling).
  1226  
  1227  	return &dialResult{
  1228  			dialConn:            dialConn,
  1229  			monitoringStartTime: monitoringStartTime,
  1230  			monitoredConn:       monitoredConn,
  1231  			sshClient:           result.sshClient,
  1232  			sshRequests:         result.sshRequests,
  1233  			livenessTestMetrics: result.livenessTestMetrics,
  1234  			extraFailureAction:  extraFailureAction,
  1235  		},
  1236  		nil
  1237  }
  1238  
  1239  // Fields are exported for JSON encoding in NoticeLivenessTest.
  1240  type livenessTestMetrics struct {
  1241  	Duration                string
  1242  	UpstreamBytes           int
  1243  	SentUpstreamBytes       int
  1244  	DownstreamBytes         int
  1245  	ReceivedDownstreamBytes int
  1246  }
  1247  
  1248  func performLivenessTest(
  1249  	sshClient *ssh.Client,
  1250  	minUpstreamBytes, maxUpstreamBytes int,
  1251  	minDownstreamBytes, maxDownstreamBytes int,
  1252  	livenessTestPRNGSeed *prng.Seed) (*livenessTestMetrics, error) {
  1253  
  1254  	metrics := new(livenessTestMetrics)
  1255  
  1256  	defer func(startTime time.Time) {
  1257  		metrics.Duration = time.Since(startTime).String()
  1258  	}(time.Now())
  1259  
  1260  	PRNG := prng.NewPRNGWithSeed(livenessTestPRNGSeed)
  1261  
  1262  	metrics.UpstreamBytes = PRNG.Range(minUpstreamBytes, maxUpstreamBytes)
  1263  	metrics.DownstreamBytes = PRNG.Range(minDownstreamBytes, maxDownstreamBytes)
  1264  
  1265  	request := &protocol.RandomStreamRequest{
  1266  		UpstreamBytes:   metrics.UpstreamBytes,
  1267  		DownstreamBytes: metrics.DownstreamBytes,
  1268  	}
  1269  
  1270  	extraData, err := json.Marshal(request)
  1271  	if err != nil {
  1272  		return metrics, errors.Trace(err)
  1273  	}
  1274  
  1275  	channel, requests, err := sshClient.OpenChannel(
  1276  		protocol.RANDOM_STREAM_CHANNEL_TYPE, extraData)
  1277  	if err != nil {
  1278  		return metrics, errors.Trace(err)
  1279  	}
  1280  	defer channel.Close()
  1281  
  1282  	go ssh.DiscardRequests(requests)
  1283  
  1284  	sent := 0
  1285  	received := 0
  1286  	upstream := new(sync.WaitGroup)
  1287  	var errUpstream, errDownstream error
  1288  
  1289  	if metrics.UpstreamBytes > 0 {
  1290  
  1291  		// Process streams concurrently to minimize elapsed time. This also
  1292  		// avoids a unidirectional flow burst early in the tunnel lifecycle.
  1293  
  1294  		upstream.Add(1)
  1295  		go func() {
  1296  			defer upstream.Done()
  1297  
  1298  			// In consideration of memory-constrained environments, use modest-sized copy
  1299  			// buffers since many tunnel establishment workers may run the liveness test
  1300  			// concurrently.
  1301  			var buffer [4096]byte
  1302  
  1303  			n, err := common.CopyNBuffer(channel, rand.Reader, int64(metrics.UpstreamBytes), buffer[:])
  1304  			sent = int(n)
  1305  			if err != nil {
  1306  				errUpstream = errors.Trace(err)
  1307  			}
  1308  		}()
  1309  	}
  1310  
  1311  	if metrics.DownstreamBytes > 0 {
  1312  		var buffer [4096]byte
  1313  		n, err := common.CopyNBuffer(ioutil.Discard, channel, int64(metrics.DownstreamBytes), buffer[:])
  1314  		received = int(n)
  1315  		if err != nil {
  1316  			errDownstream = errors.Trace(err)
  1317  		}
  1318  	}
  1319  
  1320  	upstream.Wait()
  1321  	metrics.SentUpstreamBytes = sent
  1322  	metrics.ReceivedDownstreamBytes = received
  1323  
  1324  	if errUpstream != nil {
  1325  		return metrics, errUpstream
  1326  	} else if errDownstream != nil {
  1327  		return metrics, errDownstream
  1328  	}
  1329  
  1330  	return metrics, nil
  1331  }
  1332  
  1333  // operateTunnel monitors the health of the tunnel and performs
  1334  // periodic work.
  1335  //
  1336  // BytesTransferred and TotalBytesTransferred notices are emitted
  1337  // for live reporting and diagnostics reporting, respectively.
  1338  //
  1339  // Status requests are sent to the Psiphon API to report bytes
  1340  // transferred.
  1341  //
  1342  // Periodic SSH keep alive packets are sent to ensure the underlying
  1343  // TCP connection isn't terminated by NAT, or other network
  1344  // interference -- or test if it has been terminated while the device
  1345  // has been asleep. When a keep alive times out, the tunnel is
  1346  // considered failed.
  1347  //
  1348  // An immediate SSH keep alive "probe" is sent to test the tunnel and
  1349  // server responsiveness when a port forward failure is detected: a
  1350  // failed dial or failed read/write. This keep alive has a shorter
  1351  // timeout.
  1352  //
  1353  // Note that port forward failures may be due to non-failure conditions.
  1354  // For example, when the user inputs an invalid domain name and
  1355  // resolution is done by the ssh server; or trying to connect to a
  1356  // non-white-listed port; and the error message in these cases is not
  1357  // distinguishable from a a true server error (a common error message,
  1358  // "ssh: rejected: administratively prohibited (open failed)", may be
  1359  // returned for these cases but also if the server has run out of
  1360  // ephemeral ports, for example).
  1361  //
  1362  // SSH keep alives are not sent when the tunnel has been recently
  1363  // active (not only does tunnel activity obviate the necessity of a keep
  1364  // alive, testing has shown that keep alives may time out for "busy"
  1365  // tunnels, especially over meek protocol and other high latency
  1366  // conditions).
  1367  //
  1368  // "Recently active" is defined has having received payload bytes. Sent
  1369  // bytes are not considered as testing has shown bytes may appear to
  1370  // send when certain NAT devices have interfered with the tunnel, while
  1371  // no bytes are received. In a pathological case, with DNS implemented
  1372  // as tunneled UDP, a browser may wait excessively for a domain name to
  1373  // resolve, while no new port forward is attempted which would otherwise
  1374  // result in a tunnel failure detection.
  1375  //
  1376  // TODO: change "recently active" to include having received any
  1377  // SSH protocol messages from the server, not just user payload?
  1378  func (tunnel *Tunnel) operateTunnel(tunnelOwner TunnelOwner) {
  1379  	defer tunnel.operateWaitGroup.Done()
  1380  
  1381  	now := time.Now()
  1382  	lastBytesReceivedTime := now
  1383  	lastTotalBytesTransferedTime := now
  1384  	totalSent := int64(0)
  1385  	totalReceived := int64(0)
  1386  	setDialParamsSucceeded := false
  1387  
  1388  	noticeBytesTransferredTicker := time.NewTicker(1 * time.Second)
  1389  	defer noticeBytesTransferredTicker.Stop()
  1390  
  1391  	// The next status request and ssh keep alive times are picked at random,
  1392  	// from a range, to make the resulting traffic less fingerprintable,
  1393  	// Note: not using Tickers since these are not fixed time periods.
  1394  	nextStatusRequestPeriod := func() time.Duration {
  1395  		p := tunnel.getCustomParameters()
  1396  		return prng.Period(
  1397  			p.Duration(parameters.PsiphonAPIStatusRequestPeriodMin),
  1398  			p.Duration(parameters.PsiphonAPIStatusRequestPeriodMax))
  1399  	}
  1400  
  1401  	statsTimer := time.NewTimer(nextStatusRequestPeriod())
  1402  	defer statsTimer.Stop()
  1403  
  1404  	// Schedule an almost-immediate status request to deliver any unreported
  1405  	// persistent stats.
  1406  	unreported := CountUnreportedPersistentStats()
  1407  	if unreported > 0 {
  1408  		NoticeInfo("Unreported persistent stats: %d", unreported)
  1409  		p := tunnel.getCustomParameters()
  1410  		statsTimer.Reset(
  1411  			prng.Period(
  1412  				p.Duration(parameters.PsiphonAPIStatusRequestShortPeriodMin),
  1413  				p.Duration(parameters.PsiphonAPIStatusRequestShortPeriodMax)))
  1414  	}
  1415  
  1416  	nextSshKeepAlivePeriod := func() time.Duration {
  1417  		p := tunnel.getCustomParameters()
  1418  		return prng.Period(
  1419  			p.Duration(parameters.SSHKeepAlivePeriodMin),
  1420  			p.Duration(parameters.SSHKeepAlivePeriodMax))
  1421  	}
  1422  
  1423  	// TODO: don't initialize timer when config.DisablePeriodicSshKeepAlive is set
  1424  	sshKeepAliveTimer := time.NewTimer(nextSshKeepAlivePeriod())
  1425  	if tunnel.config.DisablePeriodicSshKeepAlive {
  1426  		sshKeepAliveTimer.Stop()
  1427  	} else {
  1428  		defer sshKeepAliveTimer.Stop()
  1429  	}
  1430  
  1431  	// Perform network requests in separate goroutines so as not to block
  1432  	// other operations.
  1433  	requestsWaitGroup := new(sync.WaitGroup)
  1434  
  1435  	requestsWaitGroup.Add(1)
  1436  	signalStatusRequest := make(chan struct{})
  1437  	go func() {
  1438  		defer requestsWaitGroup.Done()
  1439  		for range signalStatusRequest {
  1440  			sendStats(tunnel)
  1441  		}
  1442  	}()
  1443  
  1444  	requestsWaitGroup.Add(1)
  1445  	signalPeriodicSshKeepAlive := make(chan time.Duration)
  1446  	sshKeepAliveError := make(chan error, 1)
  1447  	go func() {
  1448  		defer requestsWaitGroup.Done()
  1449  		isFirstPeriodicKeepAlive := true
  1450  		for timeout := range signalPeriodicSshKeepAlive {
  1451  			bytesUp := atomic.LoadInt64(&totalSent)
  1452  			bytesDown := atomic.LoadInt64(&totalReceived)
  1453  			err := tunnel.sendSshKeepAlive(
  1454  				isFirstPeriodicKeepAlive, false, timeout, bytesUp, bytesDown)
  1455  			if err != nil {
  1456  				select {
  1457  				case sshKeepAliveError <- err:
  1458  				default:
  1459  				}
  1460  			}
  1461  			isFirstPeriodicKeepAlive = false
  1462  		}
  1463  	}()
  1464  
  1465  	// Probe-type SSH keep alives have a distinct send worker and may be sent
  1466  	// concurrently, to ensure a long period keep alive timeout doesn't delay
  1467  	// failed tunnel detection.
  1468  
  1469  	requestsWaitGroup.Add(1)
  1470  	signalProbeSshKeepAlive := make(chan time.Duration)
  1471  	go func() {
  1472  		defer requestsWaitGroup.Done()
  1473  		for timeout := range signalProbeSshKeepAlive {
  1474  			bytesUp := atomic.LoadInt64(&totalSent)
  1475  			bytesDown := atomic.LoadInt64(&totalReceived)
  1476  			err := tunnel.sendSshKeepAlive(
  1477  				false, true, timeout, bytesUp, bytesDown)
  1478  			if err != nil {
  1479  				select {
  1480  				case sshKeepAliveError <- err:
  1481  				default:
  1482  				}
  1483  			}
  1484  		}
  1485  	}()
  1486  
  1487  	shutdown := false
  1488  	var err error
  1489  	for !shutdown && err == nil {
  1490  		select {
  1491  		case <-noticeBytesTransferredTicker.C:
  1492  			sent, received := transferstats.ReportRecentBytesTransferredForServer(
  1493  				tunnel.dialParams.ServerEntry.IpAddress)
  1494  
  1495  			if received > 0 {
  1496  				lastBytesReceivedTime = time.Now()
  1497  			}
  1498  
  1499  			bytesUp := atomic.AddInt64(&totalSent, sent)
  1500  			bytesDown := atomic.AddInt64(&totalReceived, received)
  1501  
  1502  			p := tunnel.getCustomParameters()
  1503  			noticePeriod := p.Duration(parameters.TotalBytesTransferredNoticePeriod)
  1504  			doEmitMemoryMetrics := p.Bool(parameters.TotalBytesTransferredEmitMemoryMetrics)
  1505  			replayTargetUpstreamBytes := p.Int(parameters.ReplayTargetUpstreamBytes)
  1506  			replayTargetDownstreamBytes := p.Int(parameters.ReplayTargetDownstreamBytes)
  1507  			replayTargetTunnelDuration := p.Duration(parameters.ReplayTargetTunnelDuration)
  1508  
  1509  			if lastTotalBytesTransferedTime.Add(noticePeriod).Before(time.Now()) {
  1510  				NoticeTotalBytesTransferred(
  1511  					tunnel.dialParams.ServerEntry.GetDiagnosticID(), bytesUp, bytesDown)
  1512  				if doEmitMemoryMetrics {
  1513  					emitMemoryMetrics()
  1514  				}
  1515  				lastTotalBytesTransferedTime = time.Now()
  1516  			}
  1517  
  1518  			// Only emit the frequent BytesTransferred notice when tunnel is not idle.
  1519  			if tunnel.config.EmitBytesTransferred && (sent > 0 || received > 0) {
  1520  				NoticeBytesTransferred(
  1521  					tunnel.dialParams.ServerEntry.GetDiagnosticID(), sent, received)
  1522  			}
  1523  
  1524  			// Once the tunnel has connected, activated, successfully transmitted the
  1525  			// targeted number of bytes, and been up for the targeted duration
  1526  			// (measured from the end of establishment), store its dial parameters for
  1527  			// subsequent replay.
  1528  			//
  1529  			// Even when target bytes and duration are all 0, the tunnel must remain up
  1530  			// for at least 1 second due to use of noticeBytesTransferredTicker; for
  1531  			// the same reason the granularity of ReplayTargetTunnelDuration is
  1532  			// seconds.
  1533  			if !setDialParamsSucceeded &&
  1534  				bytesUp >= int64(replayTargetUpstreamBytes) &&
  1535  				bytesDown >= int64(replayTargetDownstreamBytes) &&
  1536  				time.Since(tunnel.establishedTime) >= replayTargetTunnelDuration {
  1537  
  1538  				tunnel.dialParams.Succeeded()
  1539  				setDialParamsSucceeded = true
  1540  			}
  1541  
  1542  		case <-statsTimer.C:
  1543  			select {
  1544  			case signalStatusRequest <- struct{}{}:
  1545  			default:
  1546  			}
  1547  			statsTimer.Reset(nextStatusRequestPeriod())
  1548  
  1549  		case <-sshKeepAliveTimer.C:
  1550  			p := tunnel.getCustomParameters()
  1551  			inactivePeriod := p.Duration(parameters.SSHKeepAlivePeriodicInactivePeriod)
  1552  			if lastBytesReceivedTime.Add(inactivePeriod).Before(time.Now()) {
  1553  				timeout := p.Duration(parameters.SSHKeepAlivePeriodicTimeout)
  1554  				select {
  1555  				case signalPeriodicSshKeepAlive <- timeout:
  1556  				default:
  1557  				}
  1558  			}
  1559  			sshKeepAliveTimer.Reset(nextSshKeepAlivePeriod())
  1560  
  1561  		case <-tunnel.signalPortForwardFailure:
  1562  			// Note: no mutex on portForwardFailureTotal; only referenced here
  1563  			tunnel.totalPortForwardFailures++
  1564  			NoticeInfo("port forward failures for %s: %d",
  1565  				tunnel.dialParams.ServerEntry.GetDiagnosticID(),
  1566  				tunnel.totalPortForwardFailures)
  1567  
  1568  			// If the underlying Conn has closed (meek and other plugin protocols may
  1569  			// close themselves in certain error conditions), the tunnel has certainly
  1570  			// failed. Otherwise, probe with an SSH keep alive.
  1571  			//
  1572  			// TODO: the IsClosed case omits the failed tunnel logging and reset
  1573  			// actions performed by sendSshKeepAlive. Should self-closing protocols
  1574  			// perform these actions themselves?
  1575  
  1576  			if tunnel.conn.IsClosed() {
  1577  				err = errors.TraceNew("underlying conn is closed")
  1578  			} else {
  1579  				p := tunnel.getCustomParameters()
  1580  				inactivePeriod := p.Duration(parameters.SSHKeepAliveProbeInactivePeriod)
  1581  				if lastBytesReceivedTime.Add(inactivePeriod).Before(time.Now()) {
  1582  					timeout := p.Duration(parameters.SSHKeepAliveProbeTimeout)
  1583  					select {
  1584  					case signalProbeSshKeepAlive <- timeout:
  1585  					default:
  1586  					}
  1587  				}
  1588  				if !tunnel.config.DisablePeriodicSshKeepAlive {
  1589  					sshKeepAliveTimer.Reset(nextSshKeepAlivePeriod())
  1590  				}
  1591  
  1592  			}
  1593  
  1594  		case err = <-sshKeepAliveError:
  1595  
  1596  		case serverRequest := <-tunnel.sshServerRequests:
  1597  			if serverRequest != nil {
  1598  				err := HandleServerRequest(tunnelOwner, tunnel, serverRequest.Type, serverRequest.Payload)
  1599  				if err == nil {
  1600  					serverRequest.Reply(true, nil)
  1601  				} else {
  1602  					NoticeWarning("HandleServerRequest for %s failed: %s", serverRequest.Type, err)
  1603  					serverRequest.Reply(false, nil)
  1604  
  1605  				}
  1606  			}
  1607  
  1608  		case <-tunnel.operateCtx.Done():
  1609  			shutdown = true
  1610  		}
  1611  	}
  1612  
  1613  	close(signalPeriodicSshKeepAlive)
  1614  	close(signalProbeSshKeepAlive)
  1615  	close(signalStatusRequest)
  1616  	requestsWaitGroup.Wait()
  1617  
  1618  	// Capture bytes transferred since the last noticeBytesTransferredTicker tick
  1619  	sent, received := transferstats.ReportRecentBytesTransferredForServer(
  1620  		tunnel.dialParams.ServerEntry.IpAddress)
  1621  	bytesUp := atomic.AddInt64(&totalSent, sent)
  1622  	bytesDown := atomic.AddInt64(&totalReceived, received)
  1623  
  1624  	// Always emit a final NoticeTotalBytesTransferred
  1625  	NoticeTotalBytesTransferred(
  1626  		tunnel.dialParams.ServerEntry.GetDiagnosticID(), bytesUp, bytesDown)
  1627  
  1628  	if err == nil {
  1629  
  1630  		NoticeInfo("shutdown operate tunnel")
  1631  
  1632  		// This commanded shutdown case is initiated by Tunnel.Close, which will
  1633  		// wait up to parameters.TunnelOperateShutdownTimeout to allow the following
  1634  		// requests to complete.
  1635  
  1636  		// Send a final status request in order to report any outstanding persistent
  1637  		// stats and domain bytes transferred as soon as possible.
  1638  
  1639  		sendStats(tunnel)
  1640  
  1641  		// The controller connectedReporter may have initiated a connected request
  1642  		// concurrent to this commanded shutdown. SetInFlightConnectedRequest
  1643  		// ensures that a connected request doesn't start after the commanded
  1644  		// shutdown. AwaitInFlightConnectedRequest blocks until any in flight
  1645  		// request completes or is aborted after TunnelOperateShutdownTimeout.
  1646  		//
  1647  		// As any connected request is performed by a concurrent goroutine,
  1648  		// sendStats is called first and AwaitInFlightConnectedRequest second.
  1649  
  1650  		tunnel.AwaitInFlightConnectedRequest()
  1651  
  1652  	} else {
  1653  
  1654  		NoticeWarning("operate tunnel error for %s: %s",
  1655  			tunnel.dialParams.ServerEntry.GetDiagnosticID(), err)
  1656  
  1657  		tunnelOwner.SignalTunnelFailure(tunnel)
  1658  	}
  1659  }
  1660  
  1661  // sendSshKeepAlive is a helper which sends a keepalive@openssh.com request
  1662  // on the specified SSH connections and returns true of the request succeeds
  1663  // within a specified timeout. If the request fails, the associated conn is
  1664  // closed, which will terminate the associated tunnel.
  1665  func (tunnel *Tunnel) sendSshKeepAlive(
  1666  	isFirstPeriodicKeepAlive bool,
  1667  	isProbeKeepAlive bool,
  1668  	timeout time.Duration,
  1669  	bytesUp int64,
  1670  	bytesDown int64) error {
  1671  
  1672  	p := tunnel.getCustomParameters()
  1673  
  1674  	// Random padding to frustrate fingerprinting.
  1675  	request := prng.Padding(
  1676  		p.Int(parameters.SSHKeepAlivePaddingMinBytes),
  1677  		p.Int(parameters.SSHKeepAlivePaddingMaxBytes))
  1678  
  1679  	speedTestSample := isFirstPeriodicKeepAlive
  1680  	if !speedTestSample {
  1681  		speedTestSample = p.WeightedCoinFlip(
  1682  			parameters.SSHKeepAliveSpeedTestSampleProbability)
  1683  	}
  1684  
  1685  	networkConnectivityPollPeriod := p.Duration(
  1686  		parameters.SSHKeepAliveNetworkConnectivityPollingPeriod)
  1687  
  1688  	resetOnFailure := p.WeightedCoinFlip(
  1689  		parameters.SSHKeepAliveResetOnFailureProbability)
  1690  
  1691  	p.Close()
  1692  
  1693  	// Note: there is no request context since SSH requests cannot be interrupted
  1694  	// directly. Closing the tunnel will interrupt the request. A timeout is set
  1695  	// to unblock this function, but the goroutine may not exit until the tunnel
  1696  	// is closed.
  1697  
  1698  	// Use a buffer of 1 as there are two senders and only one guaranteed receive.
  1699  	errChannel := make(chan error, 1)
  1700  
  1701  	afterFunc := time.AfterFunc(timeout, func() {
  1702  		errChannel <- errors.TraceNew("timed out")
  1703  	})
  1704  	defer afterFunc.Stop()
  1705  
  1706  	go func() {
  1707  
  1708  		startTime := time.Now()
  1709  
  1710  		// Note: reading a reply is important for last-received-time tunnel
  1711  		// duration calculation.
  1712  		requestOk, response, err := tunnel.sshClient.SendRequest(
  1713  			"keepalive@openssh.com", true, request)
  1714  
  1715  		elapsedTime := time.Since(startTime)
  1716  
  1717  		errChannel <- err
  1718  
  1719  		success := (err == nil && requestOk)
  1720  
  1721  		if success && isProbeKeepAlive {
  1722  			NoticeInfo("Probe SSH keep-alive RTT: %s", elapsedTime)
  1723  		}
  1724  
  1725  		// Record the keep alive round trip as a speed test sample. The first
  1726  		// periodic keep alive is always recorded, as many tunnels are short-lived
  1727  		// and we want to ensure that some data is gathered. Subsequent keep alives
  1728  		// are recorded with some configurable probability, which, considering that
  1729  		// only the last SpeedTestMaxSampleCount samples are retained, enables
  1730  		// tuning the sampling frequency.
  1731  
  1732  		if success && speedTestSample {
  1733  
  1734  			err = tactics.AddSpeedTestSample(
  1735  				tunnel.config.GetParameters(),
  1736  				GetTacticsStorer(tunnel.config),
  1737  				tunnel.config.GetNetworkID(),
  1738  				tunnel.dialParams.ServerEntry.Region,
  1739  				tunnel.dialParams.TunnelProtocol,
  1740  				elapsedTime,
  1741  				request,
  1742  				response)
  1743  			if err != nil {
  1744  				NoticeWarning("AddSpeedTestSample failed: %s", errors.Trace(err))
  1745  			}
  1746  		}
  1747  	}()
  1748  
  1749  	// While awaiting the response, poll the network connectivity state. If there
  1750  	// is network connectivity, on the same network, for the entire duration of
  1751  	// the keep alive request and the request fails, record a failed tunnel
  1752  	// event.
  1753  	//
  1754  	// The network connectivity heuristic is intended to reduce the number of
  1755  	// failed tunnels reported due to routine situations such as varying mobile
  1756  	// network conditions. The polling may produce false positives if the network
  1757  	// goes down and up between polling periods, or changes to a new network and
  1758  	// back to the previous network between polling periods.
  1759  	//
  1760  	// For platforms that don't provide a NetworkConnectivityChecker, it is
  1761  	// assumed that there is network connectivity.
  1762  	//
  1763  	// The approximate number of tunneled bytes successfully sent and received is
  1764  	// recorded in the failed tunnel event as a quality indicator.
  1765  
  1766  	ticker := time.NewTicker(networkConnectivityPollPeriod)
  1767  	defer ticker.Stop()
  1768  	continuousNetworkConnectivity := true
  1769  	networkID := tunnel.config.GetNetworkID()
  1770  
  1771  	var err error
  1772  loop:
  1773  	for {
  1774  		select {
  1775  		case err = <-errChannel:
  1776  			break loop
  1777  		case <-ticker.C:
  1778  			connectivityChecker := tunnel.config.NetworkConnectivityChecker
  1779  			if (connectivityChecker != nil &&
  1780  				connectivityChecker.HasNetworkConnectivity() != 1) ||
  1781  				(networkID != tunnel.config.GetNetworkID()) {
  1782  
  1783  				continuousNetworkConnectivity = false
  1784  			}
  1785  		}
  1786  	}
  1787  
  1788  	err = errors.Trace(err)
  1789  
  1790  	if err != nil {
  1791  		tunnel.sshClient.Close()
  1792  		tunnel.conn.Close()
  1793  
  1794  		// Don't perform log or reset actions when the keep alive may have been
  1795  		// interrupted due to shutdown.
  1796  
  1797  		isShutdown := false
  1798  		select {
  1799  		case <-tunnel.operateCtx.Done():
  1800  			isShutdown = true
  1801  		default:
  1802  		}
  1803  
  1804  		// Ensure that at most one of the two SSH keep alive workers (periodic and
  1805  		// probe) perform the log and reset actions.
  1806  
  1807  		wasHandled := atomic.CompareAndSwapInt32(&tunnel.handledSSHKeepAliveFailure, 0, 1)
  1808  
  1809  		if continuousNetworkConnectivity &&
  1810  			!isShutdown &&
  1811  			!wasHandled {
  1812  
  1813  			_ = RecordFailedTunnelStat(
  1814  				tunnel.config,
  1815  				tunnel.dialParams,
  1816  				tunnel.livenessTestMetrics,
  1817  				bytesUp,
  1818  				bytesDown,
  1819  				err)
  1820  			if tunnel.extraFailureAction != nil {
  1821  				tunnel.extraFailureAction()
  1822  			}
  1823  
  1824  			// SSHKeepAliveResetOnFailureProbability is set when a late-lifecycle
  1825  			// impaired protocol attack is suspected. With the given probability, reset
  1826  			// server affinity and replay parameters for this server to avoid
  1827  			// continuously reconnecting to the server and/or using the same protocol
  1828  			// and dial parameters.
  1829  
  1830  			if resetOnFailure {
  1831  				NoticeInfo("Delete dial parameters for %s", tunnel.dialParams.ServerEntry.GetDiagnosticID())
  1832  				err := DeleteDialParameters(tunnel.dialParams.ServerEntry.IpAddress, tunnel.dialParams.NetworkID)
  1833  				if err != nil {
  1834  					NoticeWarning("DeleteDialParameters failed: %s", err)
  1835  				}
  1836  				NoticeInfo("Delete server affinity for %s", tunnel.dialParams.ServerEntry.GetDiagnosticID())
  1837  				err = DeleteServerEntryAffinity(tunnel.dialParams.ServerEntry.IpAddress)
  1838  				if err != nil {
  1839  					NoticeWarning("DeleteServerEntryAffinity failed: %s", err)
  1840  				}
  1841  			}
  1842  		}
  1843  	}
  1844  
  1845  	return err
  1846  }
  1847  
  1848  // sendStats is a helper for sending session stats to the server.
  1849  func sendStats(tunnel *Tunnel) bool {
  1850  
  1851  	// Tunnel does not have a serverContext when DisableApi is set
  1852  	if tunnel.serverContext == nil {
  1853  		return true
  1854  	}
  1855  
  1856  	// Skip when tunnel is discarded
  1857  	if tunnel.IsDiscarded() {
  1858  		return true
  1859  	}
  1860  
  1861  	err := tunnel.serverContext.DoStatusRequest(tunnel)
  1862  	if err != nil {
  1863  		NoticeWarning("DoStatusRequest failed for %s: %s",
  1864  			tunnel.dialParams.ServerEntry.GetDiagnosticID(), err)
  1865  	}
  1866  
  1867  	return err == nil
  1868  }