github.com/m3db/m3@v1.5.0/src/m3em/agent/agent.go (about)

     1  // Copyright (c) 2017 Uber Technologies, Inc.
     2  //
     3  // Permission is hereby granted, free of charge, to any person obtaining a copy
     4  // of this software and associated documentation files (the "Software"), to deal
     5  // in the Software without restriction, including without limitation the rights
     6  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     7  // copies of the Software, and to permit persons to whom the Software is
     8  // furnished to do so, subject to the following conditions:
     9  //
    10  // The above copyright notice and this permission notice shall be included in
    11  // all copies or substantial portions of the Software.
    12  //
    13  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    14  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    15  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    16  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    17  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    18  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    19  // THE SOFTWARE.
    20  
    21  package agent
    22  
    23  import (
    24  	"bufio"
    25  	"fmt"
    26  	"io"
    27  	"io/ioutil"
    28  	"os"
    29  	"path"
    30  	"strings"
    31  	"sync"
    32  	"sync/atomic"
    33  	"time"
    34  
    35  	"github.com/m3db/m3/src/m3em/checksum"
    36  	"github.com/m3db/m3/src/m3em/generated/proto/m3em"
    37  	"github.com/m3db/m3/src/m3em/os/exec"
    38  	"github.com/m3db/m3/src/m3em/os/fs"
    39  	xerrors "github.com/m3db/m3/src/x/errors"
    40  
    41  	"github.com/uber-go/tally"
    42  	"go.uber.org/zap"
    43  	context "golang.org/x/net/context"
    44  	"google.golang.org/grpc"
    45  	"google.golang.org/grpc/codes"
    46  )
    47  
// Package-level defaults and heartbeat reason strings:
//   - defaultReportInterval is the period of the ticker that drives
//     reportMetrics.
//   - defaultTestCanaryPrefix is the name prefix of the temporary file created
//     by canaryWriteTest.
//   - reasonTeardownHeartbeat is the reason handed to resetWithLock when the
//     agent is torn down via Teardown.
//   - reasonSetupInitializeHostResources is the reason handed to resetWithLock
//     when Setup has to abort part-way through.
const (
	defaultReportInterval              = 5 * time.Second
	defaultTestCanaryPrefix            = "test-canary-file"
	reasonTeardownHeartbeat            = "remote agent received Teardown(), turning off heartbeating"
	reasonSetupInitializeHostResources = "unable to initialize host resources, turning off heartbeating"
)
    54  
// Sentinel errors returned by the agent:
//   - errProcessMonitorNotDefined: stopWithLock was invoked while no process
//     monitor is registered on the agent.
//   - errNoValidTargetsSpecified: initFile was given an empty target list.
//   - errOnlyDataFileMultiTarget: initFile was given more than one target for a
//     file type other than a data file.
var (
	errProcessMonitorNotDefined = fmt.Errorf("process monitor not defined")
	errNoValidTargetsSpecified  = fmt.Errorf("no valid target destinations specified")
	errOnlyDataFileMultiTarget  = fmt.Errorf("multiple targets are only supported for data files")
)
    60  
// opAgent is the Agent implementation returned by New.
//
// The embedded RWMutex guards token, executablePath, configPath,
// processMonitor and heartbeater in the request handlers of this file.
// running and stopping are flags (0 or 1) that are only read and written
// through sync/atomic.
//
//   - token is the session token recorded by Setup and cleared by
//     resetWithLock; a non-empty token means the agent is set up.
//   - executablePath and configPath are recorded by markFileDone once the
//     service binary / service config has been pushed.
//   - newProcessMonitorFn builds the process monitor used by startWithLock;
//     New points it at exec.NewProcessMonitor.
//   - processMonitor is the monitor of the started process, nil when none is
//     registered.
//   - heartbeater is created by Setup when heartbeating is requested and
//     closed by resetWithLock.
//   - heartbeatTimeoutCh is closed and cleared by resetWithLock when non-nil.
//   - closeCh and doneCh form the shutdown handshake between Close and the
//     reportMetrics goroutine.
type opAgent struct {
	sync.RWMutex
	token               string
	executablePath      string
	configPath          string
	newProcessMonitorFn newProcessMonitorFn
	processMonitor      exec.ProcessMonitor
	heartbeater         *heatbeater

	running            int32
	stopping           int32
	heartbeatTimeoutCh chan struct{}

	opts    Options
	logger  *zap.Logger
	metrics *opAgentMetrics
	doneCh  chan struct{}
	closeCh chan struct{}
}
    80  
// newProcessMonitorFn constructs a process monitor for the given command and
// listener. It is stored as a field on opAgent (set to exec.NewProcessMonitor
// by New) so the constructor is not hard-wired into startWithLock.
type newProcessMonitorFn func(exec.Cmd, exec.ProcessListener) (exec.ProcessMonitor, error)
    82  
    83  // New creates and returns a new Operator Agent
    84  func New(
    85  	opts Options,
    86  ) (Agent, error) {
    87  	if err := opts.Validate(); err != nil {
    88  		return nil, err
    89  	}
    90  
    91  	if err := canaryWriteTest(opts.WorkingDirectory()); err != nil {
    92  		return nil, err
    93  	}
    94  
    95  	agent := &opAgent{
    96  		opts:                opts,
    97  		logger:              opts.InstrumentOptions().Logger(),
    98  		metrics:             newAgentMetrics(opts.InstrumentOptions().MetricsScope()),
    99  		newProcessMonitorFn: exec.NewProcessMonitor,
   100  		doneCh:              make(chan struct{}, 1),
   101  		closeCh:             make(chan struct{}, 1),
   102  	}
   103  	go agent.reportMetrics()
   104  	return agent, nil
   105  }
   106  
   107  func (o *opAgent) Close() error {
   108  	o.closeCh <- struct{}{}
   109  	<-o.doneCh
   110  	return nil
   111  }
   112  
   113  func canaryWriteTest(dir string) error {
   114  	fi, err := os.Stat(dir)
   115  	if err != nil {
   116  		return fmt.Errorf("unable to stat directory, [ err = %v ]", err)
   117  	}
   118  	if !fi.IsDir() {
   119  		return fmt.Errorf("path is not a directory")
   120  	}
   121  
   122  	fd, err := ioutil.TempFile(dir, defaultTestCanaryPrefix)
   123  	if err != nil {
   124  		return fmt.Errorf("unable to create canary file, [ err = %v ]", err)
   125  	}
   126  	os.Remove(fd.Name())
   127  
   128  	return nil
   129  }
   130  
   131  func updateBoolGauge(b bool, m tally.Gauge) {
   132  	if b {
   133  		m.Update(1)
   134  	} else {
   135  		m.Update(0)
   136  	}
   137  }
   138  
   139  func (o *opAgent) reportMetrics() {
   140  	reportTicker := time.NewTicker(defaultReportInterval)
   141  	for {
   142  		select {
   143  		case <-reportTicker.C:
   144  			state := o.state()
   145  			updateBoolGauge(state.running, o.metrics.running)
   146  			updateBoolGauge(state.executablePath != "", o.metrics.execTransferred)
   147  			updateBoolGauge(state.configPath != "", o.metrics.confTransferred)
   148  		case <-o.closeCh:
   149  			reportTicker.Stop()
   150  			o.doneCh <- struct{}{}
   151  			return
   152  		}
   153  	}
   154  }
   155  
   156  func (o *opAgent) Running() bool {
   157  	return atomic.LoadInt32(&o.running) == 1
   158  }
   159  
// opAgentState is a point-in-time copy of the agent fields that reportMetrics
// turns into gauges: the running flag plus the recorded executable and config
// paths (empty when none has been recorded).
type opAgentState struct {
	running        bool
	executablePath string
	configPath     string
}
   165  
   166  func (o *opAgent) state() opAgentState {
   167  	o.RLock()
   168  	defer o.RUnlock()
   169  	return opAgentState{
   170  		running:        o.Running(),
   171  		executablePath: o.executablePath,
   172  		configPath:     o.configPath,
   173  	}
   174  }
   175  
   176  func (o *opAgent) Start(ctx context.Context, request *m3em.StartRequest) (*m3em.StartResponse, error) {
   177  	o.logger.Info("received Start()")
   178  	o.Lock()
   179  	defer o.Unlock()
   180  
   181  	if o.Running() {
   182  		return nil, grpc.Errorf(codes.FailedPrecondition, "already running")
   183  	}
   184  
   185  	if o.executablePath == "" {
   186  		return nil, grpc.Errorf(codes.FailedPrecondition, "agent missing build")
   187  	}
   188  
   189  	if o.configPath == "" {
   190  		return nil, grpc.Errorf(codes.FailedPrecondition, "agent missing config")
   191  	}
   192  
   193  	if err := o.startWithLock(); err != nil {
   194  		return nil, grpc.Errorf(codes.Internal, "unable to start: %v", err)
   195  	}
   196  
   197  	return &m3em.StartResponse{}, nil
   198  }
   199  
   200  func (o *opAgent) onProcessTerminate(err error) {
   201  	if err == nil {
   202  		err = fmt.Errorf("test process terminated without error")
   203  	} else {
   204  		err = fmt.Errorf("test process terminated with error: %v", err)
   205  	}
   206  	o.logger.Warn(err.Error())
   207  	if stopping := atomic.LoadInt32(&o.stopping); stopping == 0 && o.heartbeater != nil {
   208  		o.heartbeater.notifyProcessTermination(err.Error())
   209  	}
   210  	atomic.StoreInt32(&o.running, 0)
   211  }
   212  
   213  func (o *opAgent) newProcessListener() exec.ProcessListener {
   214  	return exec.NewProcessListener(func() {
   215  		o.onProcessTerminate(nil)
   216  	}, func(err error) {
   217  		o.onProcessTerminate(err)
   218  	})
   219  }
   220  
   221  func (o *opAgent) startWithLock() error {
   222  	var (
   223  		path, args = o.opts.ExecGenFn()(o.executablePath, o.configPath)
   224  		osArgs     = append([]string{path}, args...)
   225  		cmd        = exec.Cmd{
   226  			Path:      path,
   227  			Args:      osArgs,
   228  			OutputDir: o.opts.WorkingDirectory(),
   229  			Env:       o.opts.EnvMap(),
   230  		}
   231  		listener = o.newProcessListener()
   232  	)
   233  	pm, err := o.newProcessMonitorFn(cmd, listener)
   234  	if err != nil {
   235  		return err
   236  	}
   237  	o.logger.Info("executing command", zap.Any("command", cmd))
   238  	if err := pm.Start(); err != nil {
   239  		return err
   240  	}
   241  	atomic.StoreInt32(&o.running, 1)
   242  	o.processMonitor = pm
   243  	return nil
   244  }
   245  
   246  func (o *opAgent) Stop(ctx context.Context, request *m3em.StopRequest) (*m3em.StopResponse, error) {
   247  	o.logger.Info("received Stop()")
   248  	o.Lock()
   249  	defer o.Unlock()
   250  
   251  	if !o.Running() {
   252  		return nil, grpc.Errorf(codes.FailedPrecondition, "not running")
   253  	}
   254  
   255  	atomic.StoreInt32(&o.stopping, 1)
   256  	if err := o.stopWithLock(); err != nil {
   257  		return nil, grpc.Errorf(codes.Internal, "unable to stop: %v", err)
   258  	}
   259  	atomic.StoreInt32(&o.stopping, 0)
   260  
   261  	return &m3em.StopResponse{}, nil
   262  }
   263  
   264  func (o *opAgent) stopWithLock() error {
   265  	if o.processMonitor == nil {
   266  		return errProcessMonitorNotDefined
   267  	}
   268  
   269  	if err := o.processMonitor.Stop(); err != nil {
   270  		return err
   271  	}
   272  
   273  	o.processMonitor = nil
   274  	atomic.StoreInt32(&o.running, 0)
   275  	return nil
   276  }
   277  
   278  func (o *opAgent) resetWithLock(reason string) error {
   279  	var multiErr xerrors.MultiError
   280  
   281  	if o.heartbeater != nil {
   282  		o.logger.Info("stopping heartbeating")
   283  		if reason != "" {
   284  			o.heartbeater.notifyOverwrite(reason)
   285  		}
   286  		multiErr = multiErr.Add(o.heartbeater.close())
   287  		o.heartbeater = nil
   288  	}
   289  
   290  	if o.heartbeatTimeoutCh != nil {
   291  		close(o.heartbeatTimeoutCh)
   292  		o.heartbeatTimeoutCh = nil
   293  	}
   294  
   295  	if o.Running() {
   296  		o.logger.Info("process running, stopping")
   297  		if err := o.stopWithLock(); err != nil {
   298  			o.logger.Warn("unable to stop", zap.Error(err))
   299  			multiErr = multiErr.Add(err)
   300  		}
   301  	}
   302  
   303  	o.logger.Info("releasing host resources")
   304  	if err := o.opts.ReleaseHostResourcesFn()(); err != nil {
   305  		o.logger.Info("unable to release host resources", zap.Error(err))
   306  		multiErr = multiErr.Add(err)
   307  	}
   308  
   309  	o.token = ""
   310  	o.executablePath = ""
   311  	o.configPath = ""
   312  	atomic.StoreInt32(&o.running, 0)
   313  
   314  	return multiErr.FinalError()
   315  }
   316  
   317  func (o *opAgent) Teardown(ctx context.Context, request *m3em.TeardownRequest) (*m3em.TeardownResponse, error) {
   318  	o.logger.Info("received Teardown()")
   319  	o.Lock()
   320  	defer o.Unlock()
   321  
   322  	if err := o.resetWithLock(reasonTeardownHeartbeat); err != nil {
   323  		return nil, grpc.Errorf(codes.Internal, "unable to teardown: %v", err)
   324  	}
   325  
   326  	return &m3em.TeardownResponse{}, nil
   327  }
   328  
   329  func (o *opAgent) isSetup() bool {
   330  	o.RLock()
   331  	defer o.RUnlock()
   332  	return o.isSetupWithLock()
   333  }
   334  
   335  func (o *opAgent) isSetupWithLock() bool {
   336  	return o.token != ""
   337  }
   338  
   339  func (o *opAgent) Setup(ctx context.Context, request *m3em.SetupRequest) (*m3em.SetupResponse, error) {
   340  	o.logger.Info("received Setup()")
   341  
   342  	// nil check
   343  	if request == nil || request.SessionToken == "" {
   344  		return nil, grpc.Errorf(codes.InvalidArgument, "nil request")
   345  	}
   346  
   347  	o.Lock()
   348  	defer o.Unlock()
   349  
   350  	if o.token != "" && o.token != request.SessionToken && !request.Force {
   351  		return nil, grpc.Errorf(codes.AlreadyExists, "agent already initialized with token: %s", o.token)
   352  	}
   353  
   354  	if o.isSetupWithLock() {
   355  		// reset agent
   356  		msg := fmt.Sprintf("heartbeating being overwritten by new setup request: %+v", *request)
   357  		if err := o.resetWithLock(msg); err != nil {
   358  			return nil, grpc.Errorf(codes.Aborted, "unable to reset: %v", err)
   359  		}
   360  	}
   361  
   362  	// remove any files stored in the working directory
   363  	wd := o.opts.WorkingDirectory()
   364  	o.logger.Info("removing contents from working directory", zap.String("dir", wd))
   365  	if err := fs.RemoveContents(wd); err != nil {
   366  		return nil, grpc.Errorf(codes.Internal, "unable to clear working directory: %v", err)
   367  	}
   368  
   369  	// initialize any resources needed on the host
   370  	o.logger.Info("initializing host resources")
   371  	if err := o.opts.InitHostResourcesFn()(); err != nil {
   372  		o.resetWithLock(reasonSetupInitializeHostResources) // release any resources
   373  		return nil, grpc.Errorf(codes.Internal, "unable to initialize host resources: %v", err)
   374  	}
   375  
   376  	// setup new heartbeating
   377  	if request.HeartbeatEnabled {
   378  		opts := heartbeatOpts{
   379  			operatorUUID: request.OperatorUuid,
   380  			endpoint:     request.HeartbeatEndpoint,
   381  			nowFn:        o.opts.NowFn(),
   382  			timeout:      o.opts.HeartbeatTimeout(),
   383  			timeoutFn:    o.heartbeatingTimeout,
   384  			errorFn:      o.heartbeatInternalError,
   385  		}
   386  		beater, err := newHeartbeater(o, opts, o.opts.InstrumentOptions())
   387  		if err != nil {
   388  			o.resetWithLock(reasonSetupInitializeHostResources) // release any resources
   389  			return nil, grpc.Errorf(codes.Aborted, "unable to start heartbeating process: %v", err)
   390  		}
   391  		o.heartbeater = beater
   392  		o.heartbeater.start(time.Second * time.Duration(request.HeartbeatFrequencySecs))
   393  	}
   394  
   395  	o.token = request.SessionToken
   396  	return &m3em.SetupResponse{}, nil
   397  }
   398  
   399  func (o *opAgent) heartbeatingTimeout(lastHb time.Time) {
   400  	o.logger.Warn("heartbeat sending timed out, resetting agent")
   401  	o.Lock()
   402  	err := o.resetWithLock("") // "" indicates we don't want to send a heartbeat
   403  	o.Unlock()
   404  	if err == nil {
   405  		o.logger.Info("successfully reset agent")
   406  	} else {
   407  		o.logger.Warn("error while resetting agent", zap.Error(err))
   408  	}
   409  }
   410  
   411  func (o *opAgent) heartbeatInternalError(err error) {
   412  	o.logger.Warn("received unknown error whilst heartbeat", zap.Error(err))
   413  	o.logger.Warn("resetting agent")
   414  	o.Lock()
   415  	err = o.resetWithLock(err.Error())
   416  	o.Unlock()
   417  	if err == nil {
   418  		o.logger.Info("successfully reset agent")
   419  	} else {
   420  		o.logger.Warn("error while resetting agent", zap.Error(err))
   421  	}
   422  }
   423  
   424  func (o *opAgent) pathsRelativeToWorkingDir(
   425  	targets []string,
   426  ) ([]string, error) {
   427  	files := make([]string, 0, len(targets))
   428  	for _, t := range targets {
   429  		if strings.Contains(t, "..") { // i.e. relative path
   430  			return nil, fmt.Errorf("relative paths not allowed: %v", t)
   431  		}
   432  		f := path.Join(o.opts.WorkingDirectory(), t)
   433  		files = append(files, f)
   434  	}
   435  	return files, nil
   436  }
   437  
   438  func (o *opAgent) initFile(
   439  	fileType m3em.PushFileType,
   440  	targets []string,
   441  	overwrite bool,
   442  ) (*multiWriter, error) {
   443  	if len(targets) < 1 {
   444  		return nil, errNoValidTargetsSpecified
   445  	}
   446  
   447  	if len(targets) > 1 && fileType != m3em.PushFileType_PUSH_FILE_TYPE_DATA_FILE {
   448  		return nil, errOnlyDataFileMultiTarget
   449  	}
   450  
   451  	paths, err := o.pathsRelativeToWorkingDir(targets)
   452  	if err != nil {
   453  		return nil, err
   454  	}
   455  
   456  	flags := os.O_CREATE | os.O_WRONLY
   457  	if overwrite {
   458  		flags = flags | os.O_TRUNC
   459  	}
   460  
   461  	fileMode := o.opts.NewFileMode()
   462  	if fileType == m3em.PushFileType_PUSH_FILE_TYPE_SERVICE_BINARY {
   463  		fileMode = os.FileMode(0755)
   464  	}
   465  
   466  	dirMode := o.opts.NewDirectoryMode()
   467  	return newMultiWriter(paths, flags, fileMode, dirMode)
   468  }
   469  
   470  func (o *opAgent) markFileDone(
   471  	fileType m3em.PushFileType,
   472  	mw *multiWriter,
   473  ) error {
   474  	if len(mw.fds) != 1 && (fileType == m3em.PushFileType_PUSH_FILE_TYPE_SERVICE_BINARY || fileType == m3em.PushFileType_PUSH_FILE_TYPE_SERVICE_CONFIG) {
   475  		// should never happen
   476  		return fmt.Errorf("internal error: multiple targets for binary/config")
   477  	}
   478  
   479  	for _, fd := range mw.fds {
   480  		o.logger.Info("file transferred",
   481  			zap.Stringer("type", fileType),
   482  			zap.String("path", fd.Name()))
   483  	}
   484  
   485  	o.Lock()
   486  	defer o.Unlock()
   487  
   488  	if fileType == m3em.PushFileType_PUSH_FILE_TYPE_SERVICE_BINARY {
   489  		o.executablePath = mw.fds[0].Name()
   490  	}
   491  
   492  	if fileType == m3em.PushFileType_PUSH_FILE_TYPE_SERVICE_CONFIG {
   493  		o.configPath = mw.fds[0].Name()
   494  	}
   495  
   496  	return nil
   497  }
   498  
   499  // PullFile receives a file from the caller to be stored locally on the agent
   500  func (o *opAgent) PushFile(stream m3em.Operator_PushFileServer) error {
   501  	o.logger.Info("received PushFile()")
   502  	var (
   503  		checksum     = checksum.NewAccumulator()
   504  		numChunks    = 0
   505  		lastChunkIdx = int32(0)
   506  		fileHandle   *multiWriter
   507  		fileType     = m3em.PushFileType_PUSH_FILE_TYPE_UNKNOWN
   508  		err          error
   509  	)
   510  
   511  	for {
   512  		request, streamErr := stream.Recv()
   513  		if streamErr != nil && streamErr != io.EOF {
   514  			return streamErr
   515  		}
   516  
   517  		if request == nil {
   518  			break
   519  		}
   520  
   521  		if numChunks == 0 {
   522  			// first request with any data in it, log it for visibilty
   523  			o.logger.Info("file transfer initiated",
   524  				zap.Strings("targets", request.GetTargetPaths()),
   525  				zap.Stringer("fileType", request.GetType()),
   526  				zap.Bool("overwrite", request.GetOverwrite()))
   527  
   528  			fileType = request.GetType()
   529  			fileHandle, err = o.initFile(fileType, request.GetTargetPaths(), request.GetOverwrite())
   530  			if err != nil {
   531  				return err
   532  			}
   533  			lastChunkIdx = request.GetData().GetIdx() - 1
   534  		}
   535  
   536  		chunkIdx := request.GetData().GetIdx()
   537  		if chunkIdx != 1+lastChunkIdx {
   538  			return fmt.Errorf("received chunkIdx: %d after %d", chunkIdx, lastChunkIdx)
   539  		}
   540  		lastChunkIdx = chunkIdx
   541  
   542  		numChunks++
   543  		bytes := request.GetData().GetBytes()
   544  		checksum.Update(bytes)
   545  
   546  		numWritten, err := fileHandle.write(bytes)
   547  		if err != nil {
   548  			return err
   549  		}
   550  
   551  		if numWritten != len(bytes) {
   552  			return fmt.Errorf("unable to write bytes, expected: %d, observed: %d", len(bytes), numWritten)
   553  		}
   554  
   555  		if streamErr == io.EOF {
   556  			break
   557  		}
   558  	}
   559  
   560  	if fileHandle == nil {
   561  		return fmt.Errorf("multiwriter has not been initialized")
   562  	}
   563  
   564  	var me xerrors.MultiError
   565  	me = me.Add(fileHandle.Close())
   566  	me = me.Add(o.markFileDone(fileType, fileHandle))
   567  	if err := me.FinalError(); err != nil {
   568  		return err
   569  	}
   570  
   571  	return stream.SendAndClose(&m3em.PushFileResponse{
   572  		FileChecksum:   checksum.Current(),
   573  		NumChunksRecvd: int32(numChunks),
   574  	})
   575  }
   576  
   577  func validatePullFileRequest(request *m3em.PullFileRequest) error {
   578  	if request == nil {
   579  		return grpc.Errorf(codes.InvalidArgument, "nil request")
   580  	}
   581  
   582  	if request.ChunkSize <= 0 {
   583  		return grpc.Errorf(codes.InvalidArgument, "chunkSize must be a positive integer")
   584  	}
   585  
   586  	if request.MaxSize < 0 {
   587  		return grpc.Errorf(codes.InvalidArgument, "maxSize must be a non-negative integer")
   588  	}
   589  
   590  	return nil
   591  }
   592  
   593  // PullFile sends a local agent file to the caller
   594  func (o *opAgent) PullFile(request *m3em.PullFileRequest, stream m3em.Operator_PullFileServer) error {
   595  	if err := validatePullFileRequest(request); err != nil {
   596  		return err
   597  	}
   598  	o.logger.Info("received PullFile()", zap.Any("request", *request))
   599  
   600  	o.RLock()
   601  	defer o.RUnlock()
   602  
   603  	if !o.isSetupWithLock() {
   604  		return grpc.Errorf(codes.InvalidArgument, "agent has not been setup, unable to transfer file")
   605  	}
   606  
   607  	pm := o.processMonitor
   608  	if pm == nil {
   609  		return grpc.Errorf(codes.InvalidArgument, "no process running, unable to transfer file")
   610  	}
   611  
   612  	switch fileType := request.GetFileType(); fileType {
   613  	case m3em.PullFileType_PULL_FILE_TYPE_SERVICE_STDERR:
   614  		return o.sendLocalFileWithRLock(pm.StderrPath(), request.ChunkSize, request.MaxSize, stream)
   615  
   616  	case m3em.PullFileType_PULL_FILE_TYPE_SERVICE_STDOUT:
   617  		return o.sendLocalFileWithRLock(pm.StdoutPath(), request.ChunkSize, request.MaxSize, stream)
   618  
   619  	default:
   620  		return grpc.Errorf(codes.InvalidArgument, "received unknown pull file: %v", fileType)
   621  	}
   622  }
   623  
   624  func (o *opAgent) sendLocalFileWithRLock(localPath string, chunkSize int64, maxBytes int64, stream m3em.Operator_PullFileServer) error {
   625  	fi, err := os.Stat(localPath)
   626  	if err != nil {
   627  		return grpc.Errorf(codes.InvalidArgument, "unable to find file: %v", err)
   628  	}
   629  
   630  	fd, err := os.Open(localPath)
   631  	if err != nil {
   632  		return grpc.Errorf(codes.InvalidArgument, "unable to open file: %v", err)
   633  	}
   634  
   635  	var (
   636  		reader    = bufio.NewReaderSize(fd, int(chunkSize))
   637  		buf       = make([]byte, chunkSize)
   638  		chunkIdx  = 1
   639  		truncated = false
   640  	)
   641  
   642  	// check if we need to seek ahead or if we are sending all the bytes
   643  	if maxBytes > 0 && fi.Size() > maxBytes {
   644  		offset := fi.Size() - maxBytes
   645  		if _, err := fd.Seek(offset, 0 /* relative to start of file */); err != nil {
   646  			return grpc.Errorf(codes.Internal, "unable to seek file: %v", err)
   647  		}
   648  		truncated = true
   649  	}
   650  
   651  	for {
   652  		n, err := reader.Read(buf)
   653  		switch err {
   654  		case io.EOF:
   655  			// i.e. streamed through the file, we can indicate we're done
   656  			return nil
   657  
   658  		case nil:
   659  			// i.e. this read succeeded, send it and continue as we can read more data
   660  			if streamErr := stream.Send(&m3em.PullFileResponse{
   661  				Data: &m3em.DataChunk{
   662  					Bytes: buf[:n],
   663  					Idx:   int32(chunkIdx),
   664  				},
   665  				Truncated: truncated,
   666  			}); streamErr != nil {
   667  				return grpc.Errorf(codes.Internal, "unable to send chunk: %v", streamErr.Error())
   668  			}
   669  
   670  		default:
   671  			// i.e. something broke
   672  			return grpc.Errorf(codes.Unavailable, "unable to read file: %v", err.Error())
   673  		}
   674  
   675  		// increment idx
   676  		chunkIdx++
   677  	}
   678  
   679  }
   680  
// opAgentMetrics holds the gauges published by reportMetrics. Each is used as
// a boolean (1 or 0): running mirrors the agent's running flag, while
// execTransferred and confTransferred say whether an executable / config path
// has been recorded.
type opAgentMetrics struct {
	// TODO(prateek): process monitor opts, metric for process uptime
	running         tally.Gauge
	execTransferred tally.Gauge
	confTransferred tally.Gauge
}
   687  
   688  func newAgentMetrics(scope tally.Scope) *opAgentMetrics {
   689  	subscope := scope.SubScope("agent")
   690  	return &opAgentMetrics{
   691  		running:         subscope.Gauge("running"),
   692  		execTransferred: subscope.Gauge("exec_transferred"),
   693  		confTransferred: subscope.Gauge("conf_transferred"),
   694  	}
   695  }