github.com/m3db/m3@v1.5.1-0.20231129193456-75a402aa583b/src/m3em/node/node.go (about)

     1  // Copyright (c) 2017 Uber Technologies, Inc.
     2  //
     3  // Permission is hereby granted, free of charge, to any person obtaining a copy
     4  // of this software and associated documentation files (the "Software"), to deal
     5  // in the Software without restriction, including without limitation the rights
     6  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     7  // copies of the Software, and to permit persons to whom the Software is
     8  // furnished to do so, subject to the following conditions:
     9  //
    10  // The above copyright notice and this permission notice shall be included in
    11  // all copies or substantial portions of the Software.
    12  //
    13  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    14  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    15  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    16  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    17  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    18  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    19  // THE SOFTWARE.
    20  
    21  package node
    22  
    23  import (
    24  	"context"
    25  	"fmt"
    26  	"io"
    27  	"os"
    28  	"path/filepath"
    29  	"sync"
    30  
    31  	"github.com/m3db/m3/src/cluster/placement"
    32  	"github.com/m3db/m3/src/m3em/build"
    33  	"github.com/m3db/m3/src/m3em/generated/proto/m3em"
    34  	"github.com/m3db/m3/src/m3em/os/fs"
    35  	xclock "github.com/m3db/m3/src/x/clock"
    36  	xerrors "github.com/m3db/m3/src/x/errors"
    37  
    38  	"github.com/pborman/uuid"
    39  	"go.uber.org/zap"
    40  	"google.golang.org/grpc"
    41  )
    42  
    43  var (
    44  	errUnableToSetupInitializedNode = fmt.Errorf("unable to setup node, must be either setup/uninitialized")
    45  	errUnableToTeardownNode         = fmt.Errorf("unable to teardown node, must be either setup/running")
    46  	errUnableToStartNode            = fmt.Errorf("unable to start node, it must be setup")
    47  	errUnableToStopNode             = fmt.Errorf("unable to stop node, it must be running")
    48  	errUnableToTransferFile         = fmt.Errorf("unable to transfer file. node must be setup/running")
    49  )
    50  
// svcNode is the ServiceNode implementation returned by New. It drives a
// remote agent through the m3em operator client and tracks the node's
// lifecycle status locally.
type svcNode struct {
	// Mutex is held by the exported lifecycle and transfer methods (Setup,
	// Start, Stop, Teardown, Status, ...) while they read or update status and
	// talk to the remote agent.
	sync.Mutex
	// Instance is the placement instance this node wraps.
	placement.Instance
	// logger is taken from the instrument options supplied to New.
	logger            *zap.Logger
	// opts holds the options the node was constructed with.
	opts              Options
	// status is the current lifecycle status; it starts as StatusUninitialized.
	status            Status
	// currentBuild and currentConf record the arguments of the latest Setup call.
	currentBuild      build.ServiceBuild
	currentConf       build.ServiceConfiguration
	// clientConn is the connection backing client; Close closes it and sets it to nil.
	clientConn        *grpc.ClientConn
	// client issues operator RPCs (Setup, Start, Stop, Teardown, PushFile, PullFile).
	client            m3em.OperatorClient
	// listeners is the group of listeners registered through RegisterListener.
	listeners         *listenerGroup
	// heartbeater is only created when heartbeating is enabled in opts, and is
	// reset to nil by Close.
	heartbeater       *opHeartbeatServer
	// operatorUUID is the random UUID sent in the setup request and used as the
	// registration key with the heartbeat router.
	operatorUUID      string
	// heartbeatEndpoint is the heartbeat router's endpoint; it is left empty
	// when heartbeating is disabled.
	heartbeatEndpoint string
}
    66  
    67  // New returns a new ServiceNode.
    68  func New(
    69  	node placement.Instance,
    70  	opts Options,
    71  ) (ServiceNode, error) {
    72  	if err := opts.Validate(); err != nil {
    73  		return nil, err
    74  	}
    75  
    76  	clientConn, client, err := opts.OperatorClientFn()()
    77  	if err != nil {
    78  		return nil, err
    79  	}
    80  
    81  	uuid := uuid.NewRandom()
    82  
    83  	var (
    84  		retNode = &svcNode{
    85  			logger:   opts.InstrumentOptions().Logger(),
    86  			opts:     opts,
    87  			Instance: node,
    88  			status:   StatusUninitialized,
    89  		}
    90  		listeners      = newListenerGroup(retNode)
    91  		hbUUID         = uuid.String()
    92  		heartbeater    *opHeartbeatServer
    93  		routerEndpoint string
    94  	)
    95  
    96  	if opts.HeartbeatOptions().Enabled() {
    97  		router := opts.HeartbeatOptions().HeartbeatRouter()
    98  		routerEndpoint = router.Endpoint()
    99  		heartbeater = newHeartbeater(listeners, opts.HeartbeatOptions(), opts.InstrumentOptions())
   100  		if err := router.Register(hbUUID, heartbeater); err != nil {
   101  			return nil, fmt.Errorf("unable to register heartbeat server with router: %v", err)
   102  		}
   103  	}
   104  
   105  	retNode.listeners = listeners
   106  	retNode.client = client
   107  	retNode.clientConn = clientConn
   108  	retNode.heartbeater = heartbeater
   109  	retNode.heartbeatEndpoint = routerEndpoint
   110  	retNode.operatorUUID = hbUUID
   111  	return retNode, nil
   112  }
   113  
   114  func (i *svcNode) String() string {
   115  	i.Lock()
   116  	defer i.Unlock()
   117  	return fmt.Sprintf("ServiceNode %s", i.Instance.String())
   118  }
   119  
   120  func (i *svcNode) heartbeatReceived() bool {
   121  	return !i.heartbeater.lastHeartbeatTime().IsZero()
   122  }
   123  
   124  func (i *svcNode) Setup(
   125  	bld build.ServiceBuild,
   126  	conf build.ServiceConfiguration,
   127  	token string,
   128  	force bool,
   129  ) error {
   130  	i.Lock()
   131  	defer i.Unlock()
   132  	if i.status != StatusUninitialized &&
   133  		i.status != StatusSetup {
   134  		return errUnableToSetupInitializedNode
   135  	}
   136  
   137  	i.currentConf = conf
   138  	i.currentBuild = bld
   139  
   140  	freq := uint32(i.opts.HeartbeatOptions().Interval().Seconds())
   141  	err := i.opts.Retrier().Attempt(func() error {
   142  		ctx := context.Background()
   143  		_, err := i.client.Setup(ctx, &m3em.SetupRequest{
   144  			OperatorUuid:           i.operatorUUID,
   145  			SessionToken:           token,
   146  			Force:                  force,
   147  			HeartbeatEnabled:       i.opts.HeartbeatOptions().Enabled(),
   148  			HeartbeatEndpoint:      i.heartbeatEndpoint,
   149  			HeartbeatFrequencySecs: freq,
   150  		})
   151  		return err
   152  	})
   153  
   154  	if err != nil {
   155  		return fmt.Errorf("unable to setup: %v", err)
   156  	}
   157  
   158  	// TODO(prateek): make heartbeat pickup existing agent state
   159  
   160  	// Wait till we receive our first heartbeat
   161  	if i.opts.HeartbeatOptions().Enabled() {
   162  		i.logger.Info("waiting until initial heartbeat is received")
   163  		received := xclock.WaitUntil(i.heartbeatReceived, i.opts.HeartbeatOptions().Timeout())
   164  		if !received {
   165  			return fmt.Errorf("did not receive heartbeat response from remote agent within timeout")
   166  		}
   167  		i.logger.Info("initial heartbeat received")
   168  
   169  		// start hb monitoring
   170  		if err := i.heartbeater.start(); err != nil {
   171  			return fmt.Errorf("unable to start heartbeat monitor loop: %v", err)
   172  		}
   173  	}
   174  
   175  	// transfer build
   176  	if err := i.opts.Retrier().Attempt(func() error {
   177  		iter, err := bld.Iter(i.opts.TransferBufferSize())
   178  		if err != nil {
   179  			return err
   180  		}
   181  		return i.transferFile(transferOpts{
   182  			targets:   []string{bld.ID()},
   183  			fileType:  m3em.PushFileType_PUSH_FILE_TYPE_SERVICE_BINARY,
   184  			overwrite: force,
   185  			iter:      iter,
   186  		})
   187  	}); err != nil {
   188  		return fmt.Errorf("unable to transfer build: %v", err)
   189  	}
   190  
   191  	if err := i.opts.Retrier().Attempt(func() error {
   192  		iter, err := conf.Iter(i.opts.TransferBufferSize())
   193  		if err != nil {
   194  			return err
   195  		}
   196  		return i.transferFile(transferOpts{
   197  			targets:   []string{conf.ID()},
   198  			fileType:  m3em.PushFileType_PUSH_FILE_TYPE_SERVICE_CONFIG,
   199  			overwrite: force,
   200  			iter:      iter,
   201  		})
   202  	}); err != nil {
   203  		return fmt.Errorf("unable to transfer config: %v", err)
   204  	}
   205  
   206  	i.status = StatusSetup
   207  	return nil
   208  }
   209  
// transferOpts bundles the parameters of a single file push performed by
// transferFile.
// nolint: maligned
type transferOpts struct {
	// targets is sent as the TargetPaths of every push request.
	targets   []string
	// fileType is sent as the Type of every push request.
	fileType  m3em.PushFileType
	// iter yields the chunks to send and the checksum compared against the
	// server's response; transferFile closes it before returning.
	iter      fs.FileReaderIter
	// overwrite is sent as the Overwrite flag of every push request.
	overwrite bool
}
   217  
   218  func (i *svcNode) transferFile(
   219  	t transferOpts,
   220  ) error {
   221  	defer t.iter.Close()
   222  	ctx := context.Background()
   223  	stream, err := i.client.PushFile(ctx)
   224  	if err != nil {
   225  		return err
   226  	}
   227  	chunkIdx := 0
   228  	for ; t.iter.Next(); chunkIdx++ {
   229  		bytes := t.iter.Current()
   230  		request := &m3em.PushFileRequest{
   231  			Type:        t.fileType,
   232  			TargetPaths: t.targets,
   233  			Overwrite:   t.overwrite,
   234  			Data: &m3em.DataChunk{
   235  				Bytes: bytes,
   236  				Idx:   int32(chunkIdx),
   237  			},
   238  		}
   239  		err := stream.Send(request)
   240  		if err != nil {
   241  			stream.CloseSend()
   242  			return err
   243  		}
   244  	}
   245  	if err := t.iter.Err(); err != nil {
   246  		stream.CloseSend()
   247  		return err
   248  	}
   249  
   250  	response, err := stream.CloseAndRecv()
   251  	if err != nil {
   252  		return err
   253  	}
   254  
   255  	if int(response.NumChunksRecvd) != chunkIdx {
   256  		return fmt.Errorf("sent %d chunks, server only received %d of them", chunkIdx, response.NumChunksRecvd)
   257  	}
   258  
   259  	if t.iter.Checksum() != response.FileChecksum {
   260  		return fmt.Errorf("expected file checksum: %d, received: %d", t.iter.Checksum(), response.FileChecksum)
   261  	}
   262  
   263  	return nil
   264  }
   265  
   266  func (i *svcNode) TransferLocalFile(
   267  	srcPath string,
   268  	destPaths []string,
   269  	overwrite bool,
   270  ) error {
   271  	i.Lock()
   272  	defer i.Unlock()
   273  
   274  	if i.status != StatusSetup && i.status != StatusRunning {
   275  		return errUnableToTransferFile
   276  	}
   277  
   278  	if err := i.opts.Retrier().Attempt(func() error {
   279  		iter, err := fs.NewSizedFileReaderIter(srcPath, i.opts.TransferBufferSize())
   280  		if err != nil {
   281  			return err
   282  		}
   283  		return i.transferFile(transferOpts{
   284  			targets:   destPaths,
   285  			fileType:  m3em.PushFileType_PUSH_FILE_TYPE_DATA_FILE,
   286  			overwrite: overwrite,
   287  			iter:      iter,
   288  		})
   289  	}); err != nil {
   290  		return fmt.Errorf("unable to transfer file: %v", err)
   291  	}
   292  
   293  	return nil
   294  }
   295  
   296  func (i *svcNode) pullRemoteFile(t m3em.PullFileType, fd *os.File) (bool, error) {
   297  	ctx := context.Background()
   298  
   299  	// resetting file in case this a retry
   300  	if err := fd.Truncate(0); err != nil {
   301  		return false, err
   302  	}
   303  
   304  	// create streaming client
   305  	client, err := i.client.PullFile(ctx, &m3em.PullFileRequest{
   306  		ChunkSize: int64(i.opts.TransferBufferSize()),
   307  		MaxSize:   i.opts.MaxPullSize(),
   308  		FileType:  t,
   309  	})
   310  	if err != nil {
   311  		return false, err
   312  	}
   313  
   314  	// iterate through responses
   315  	truncated := false
   316  	for {
   317  		response, err := client.Recv()
   318  		switch err {
   319  		case nil: // this Recv was successful, and we have more to read
   320  			truncated = response.Truncated
   321  			if _, writeErr := fd.Write(response.Data.Bytes); writeErr != nil {
   322  				return truncated, writeErr
   323  			}
   324  
   325  		case io.EOF: // no more to read, indicate success
   326  			return truncated, nil
   327  
   328  		default: // unexpected error, indicate failure
   329  			return truncated, err
   330  		}
   331  	}
   332  }
   333  
   334  func toM3EMPullType(t RemoteOutputType) (m3em.PullFileType, error) {
   335  	switch t {
   336  	case RemoteProcessStderr:
   337  		return m3em.PullFileType_PULL_FILE_TYPE_SERVICE_STDERR, nil
   338  
   339  	case RemoteProcessStdout:
   340  		return m3em.PullFileType_PULL_FILE_TYPE_SERVICE_STDOUT, nil
   341  
   342  	default:
   343  		return m3em.PullFileType_PULL_FILE_TYPE_UNKNOWN, fmt.Errorf("unknown output type: %v", t)
   344  	}
   345  }
   346  
   347  func (i *svcNode) GetRemoteOutput(
   348  	t RemoteOutputType,
   349  	localDest string,
   350  ) (bool, error) {
   351  	i.Lock()
   352  	defer i.Unlock()
   353  
   354  	if i.status != StatusSetup && i.status != StatusRunning {
   355  		return false, errUnableToTransferFile
   356  	}
   357  
   358  	mType, err := toM3EMPullType(t)
   359  	if err != nil {
   360  		return false, err
   361  	}
   362  
   363  	// create base directory for specified remote path if it doesn't exist
   364  	base := filepath.Dir(localDest)
   365  	if err := os.MkdirAll(base, os.FileMode(0755)|os.ModeDir); err != nil {
   366  		return false, err
   367  	}
   368  
   369  	fd, err := os.OpenFile(localDest, os.O_CREATE|os.O_WRONLY, os.FileMode(0666))
   370  	if err != nil {
   371  		return false, err
   372  	}
   373  
   374  	truncated := false
   375  	if retryErr := i.opts.Retrier().Attempt(func() error {
   376  		truncated, err = i.pullRemoteFile(mType, fd)
   377  		return err
   378  	}); retryErr != nil {
   379  		return truncated, fmt.Errorf("unable to get remote output: %v", retryErr)
   380  	}
   381  
   382  	return truncated, fd.Close()
   383  }
   384  
   385  func (i *svcNode) Teardown() error {
   386  	i.Lock()
   387  	defer i.Unlock()
   388  	if status := i.status; status != StatusRunning &&
   389  		status != StatusSetup &&
   390  		status != StatusError {
   391  		return errUnableToTeardownNode
   392  	}
   393  
   394  	// clear any listeners
   395  	i.listeners.clear()
   396  
   397  	if err := i.opts.Retrier().Attempt(func() error {
   398  		ctx := context.Background()
   399  		_, err := i.client.Teardown(ctx, &m3em.TeardownRequest{})
   400  		return err
   401  	}); err != nil {
   402  		return err
   403  	}
   404  
   405  	if err := i.Close(); err != nil {
   406  		return err
   407  	}
   408  
   409  	i.status = StatusUninitialized
   410  	return nil
   411  }
   412  
   413  func (i *svcNode) Close() error {
   414  	var err xerrors.MultiError
   415  
   416  	if conn := i.clientConn; conn != nil {
   417  		err = err.Add(conn.Close())
   418  		i.clientConn = nil
   419  	}
   420  
   421  	if hbServer := i.heartbeater; hbServer != nil {
   422  		hbServer.stop()
   423  		err = err.Add(i.opts.HeartbeatOptions().HeartbeatRouter().Deregister(i.operatorUUID))
   424  		i.heartbeater = nil
   425  		i.operatorUUID = ""
   426  	}
   427  
   428  	return err.FinalError()
   429  }
   430  
   431  func (i *svcNode) Start() error {
   432  	i.Lock()
   433  	defer i.Unlock()
   434  	if i.status != StatusSetup {
   435  		return errUnableToStartNode
   436  	}
   437  
   438  	if err := i.opts.Retrier().Attempt(func() error {
   439  		ctx := context.Background()
   440  		_, err := i.client.Start(ctx, &m3em.StartRequest{})
   441  		return err
   442  	}); err != nil {
   443  		return err
   444  	}
   445  
   446  	i.status = StatusRunning
   447  	return nil
   448  }
   449  
   450  func (i *svcNode) Stop() error {
   451  	i.Lock()
   452  	defer i.Unlock()
   453  	if i.status != StatusRunning {
   454  		return errUnableToStopNode
   455  	}
   456  
   457  	if err := i.opts.Retrier().Attempt(func() error {
   458  		ctx := context.Background()
   459  		_, err := i.client.Stop(ctx, &m3em.StopRequest{})
   460  		return err
   461  	}); err != nil {
   462  		return err
   463  	}
   464  
   465  	i.status = StatusSetup
   466  	return nil
   467  }
   468  
   469  func (i *svcNode) Status() Status {
   470  	i.Lock()
   471  	defer i.Unlock()
   472  	return i.status
   473  }
   474  
   475  func (i *svcNode) RegisterListener(l Listener) ListenerID {
   476  	return ListenerID(i.listeners.add(l))
   477  }
   478  
   479  func (i *svcNode) DeregisterListener(token ListenerID) {
   480  	i.listeners.remove(int(token))
   481  }