// Source: github.com/m3db/m3@v1.5.1-0.20231129193456-75a402aa583b/src/m3em/agent/agent.go
//
// Copyright (c) 2017 Uber Technologies, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
20 21 package agent 22 23 import ( 24 "bufio" 25 "fmt" 26 "io" 27 "io/ioutil" 28 "os" 29 "path" 30 "strings" 31 "sync" 32 "sync/atomic" 33 "time" 34 35 "github.com/m3db/m3/src/m3em/checksum" 36 "github.com/m3db/m3/src/m3em/generated/proto/m3em" 37 "github.com/m3db/m3/src/m3em/os/exec" 38 "github.com/m3db/m3/src/m3em/os/fs" 39 xerrors "github.com/m3db/m3/src/x/errors" 40 41 "github.com/uber-go/tally" 42 "go.uber.org/zap" 43 context "golang.org/x/net/context" 44 "google.golang.org/grpc" 45 "google.golang.org/grpc/codes" 46 ) 47 48 const ( 49 defaultReportInterval = 5 * time.Second 50 defaultTestCanaryPrefix = "test-canary-file" 51 reasonTeardownHeartbeat = "remote agent received Teardown(), turning off heartbeating" 52 reasonSetupInitializeHostResources = "unable to initialize host resources, turning off heartbeating" 53 ) 54 55 var ( 56 errProcessMonitorNotDefined = fmt.Errorf("process monitor not defined") 57 errNoValidTargetsSpecified = fmt.Errorf("no valid target destinations specified") 58 errOnlyDataFileMultiTarget = fmt.Errorf("multiple targets are only supported for data files") 59 ) 60 61 type opAgent struct { 62 sync.RWMutex 63 token string 64 executablePath string 65 configPath string 66 newProcessMonitorFn newProcessMonitorFn 67 processMonitor exec.ProcessMonitor 68 heartbeater *heatbeater 69 70 running int32 71 stopping int32 72 heartbeatTimeoutCh chan struct{} 73 74 opts Options 75 logger *zap.Logger 76 metrics *opAgentMetrics 77 doneCh chan struct{} 78 closeCh chan struct{} 79 } 80 81 type newProcessMonitorFn func(exec.Cmd, exec.ProcessListener) (exec.ProcessMonitor, error) 82 83 // New creates and returns a new Operator Agent 84 func New( 85 opts Options, 86 ) (Agent, error) { 87 if err := opts.Validate(); err != nil { 88 return nil, err 89 } 90 91 if err := canaryWriteTest(opts.WorkingDirectory()); err != nil { 92 return nil, err 93 } 94 95 agent := &opAgent{ 96 opts: opts, 97 logger: opts.InstrumentOptions().Logger(), 98 metrics: 
newAgentMetrics(opts.InstrumentOptions().MetricsScope()), 99 newProcessMonitorFn: exec.NewProcessMonitor, 100 doneCh: make(chan struct{}, 1), 101 closeCh: make(chan struct{}, 1), 102 } 103 go agent.reportMetrics() 104 return agent, nil 105 } 106 107 func (o *opAgent) Close() error { 108 o.closeCh <- struct{}{} 109 <-o.doneCh 110 return nil 111 } 112 113 func canaryWriteTest(dir string) error { 114 fi, err := os.Stat(dir) 115 if err != nil { 116 return fmt.Errorf("unable to stat directory, [ err = %v ]", err) 117 } 118 if !fi.IsDir() { 119 return fmt.Errorf("path is not a directory") 120 } 121 122 fd, err := ioutil.TempFile(dir, defaultTestCanaryPrefix) 123 if err != nil { 124 return fmt.Errorf("unable to create canary file, [ err = %v ]", err) 125 } 126 os.Remove(fd.Name()) 127 128 return nil 129 } 130 131 func updateBoolGauge(b bool, m tally.Gauge) { 132 if b { 133 m.Update(1) 134 } else { 135 m.Update(0) 136 } 137 } 138 139 func (o *opAgent) reportMetrics() { 140 reportTicker := time.NewTicker(defaultReportInterval) 141 for { 142 select { 143 case <-reportTicker.C: 144 state := o.state() 145 updateBoolGauge(state.running, o.metrics.running) 146 updateBoolGauge(state.executablePath != "", o.metrics.execTransferred) 147 updateBoolGauge(state.configPath != "", o.metrics.confTransferred) 148 case <-o.closeCh: 149 reportTicker.Stop() 150 o.doneCh <- struct{}{} 151 return 152 } 153 } 154 } 155 156 func (o *opAgent) Running() bool { 157 return atomic.LoadInt32(&o.running) == 1 158 } 159 160 type opAgentState struct { 161 running bool 162 executablePath string 163 configPath string 164 } 165 166 func (o *opAgent) state() opAgentState { 167 o.RLock() 168 defer o.RUnlock() 169 return opAgentState{ 170 running: o.Running(), 171 executablePath: o.executablePath, 172 configPath: o.configPath, 173 } 174 } 175 176 func (o *opAgent) Start(ctx context.Context, request *m3em.StartRequest) (*m3em.StartResponse, error) { 177 o.logger.Info("received Start()") 178 o.Lock() 179 defer 
o.Unlock() 180 181 if o.Running() { 182 return nil, grpc.Errorf(codes.FailedPrecondition, "already running") 183 } 184 185 if o.executablePath == "" { 186 return nil, grpc.Errorf(codes.FailedPrecondition, "agent missing build") 187 } 188 189 if o.configPath == "" { 190 return nil, grpc.Errorf(codes.FailedPrecondition, "agent missing config") 191 } 192 193 if err := o.startWithLock(); err != nil { 194 return nil, grpc.Errorf(codes.Internal, "unable to start: %v", err) 195 } 196 197 return &m3em.StartResponse{}, nil 198 } 199 200 func (o *opAgent) onProcessTerminate(err error) { 201 if err == nil { 202 err = fmt.Errorf("test process terminated without error") 203 } else { 204 err = fmt.Errorf("test process terminated with error: %v", err) 205 } 206 o.logger.Warn(err.Error()) 207 if stopping := atomic.LoadInt32(&o.stopping); stopping == 0 && o.heartbeater != nil { 208 o.heartbeater.notifyProcessTermination(err.Error()) 209 } 210 atomic.StoreInt32(&o.running, 0) 211 } 212 213 func (o *opAgent) newProcessListener() exec.ProcessListener { 214 return exec.NewProcessListener(func() { 215 o.onProcessTerminate(nil) 216 }, func(err error) { 217 o.onProcessTerminate(err) 218 }) 219 } 220 221 func (o *opAgent) startWithLock() error { 222 var ( 223 path, args = o.opts.ExecGenFn()(o.executablePath, o.configPath) 224 osArgs = append([]string{path}, args...) 
225 cmd = exec.Cmd{ 226 Path: path, 227 Args: osArgs, 228 OutputDir: o.opts.WorkingDirectory(), 229 Env: o.opts.EnvMap(), 230 } 231 listener = o.newProcessListener() 232 ) 233 pm, err := o.newProcessMonitorFn(cmd, listener) 234 if err != nil { 235 return err 236 } 237 o.logger.Info("executing command", zap.Any("command", cmd)) 238 if err := pm.Start(); err != nil { 239 return err 240 } 241 atomic.StoreInt32(&o.running, 1) 242 o.processMonitor = pm 243 return nil 244 } 245 246 func (o *opAgent) Stop(ctx context.Context, request *m3em.StopRequest) (*m3em.StopResponse, error) { 247 o.logger.Info("received Stop()") 248 o.Lock() 249 defer o.Unlock() 250 251 if !o.Running() { 252 return nil, grpc.Errorf(codes.FailedPrecondition, "not running") 253 } 254 255 atomic.StoreInt32(&o.stopping, 1) 256 if err := o.stopWithLock(); err != nil { 257 return nil, grpc.Errorf(codes.Internal, "unable to stop: %v", err) 258 } 259 atomic.StoreInt32(&o.stopping, 0) 260 261 return &m3em.StopResponse{}, nil 262 } 263 264 func (o *opAgent) stopWithLock() error { 265 if o.processMonitor == nil { 266 return errProcessMonitorNotDefined 267 } 268 269 if err := o.processMonitor.Stop(); err != nil { 270 return err 271 } 272 273 o.processMonitor = nil 274 atomic.StoreInt32(&o.running, 0) 275 return nil 276 } 277 278 func (o *opAgent) resetWithLock(reason string) error { 279 var multiErr xerrors.MultiError 280 281 if o.heartbeater != nil { 282 o.logger.Info("stopping heartbeating") 283 if reason != "" { 284 o.heartbeater.notifyOverwrite(reason) 285 } 286 multiErr = multiErr.Add(o.heartbeater.close()) 287 o.heartbeater = nil 288 } 289 290 if o.heartbeatTimeoutCh != nil { 291 close(o.heartbeatTimeoutCh) 292 o.heartbeatTimeoutCh = nil 293 } 294 295 if o.Running() { 296 o.logger.Info("process running, stopping") 297 if err := o.stopWithLock(); err != nil { 298 o.logger.Warn("unable to stop", zap.Error(err)) 299 multiErr = multiErr.Add(err) 300 } 301 } 302 303 o.logger.Info("releasing host resources") 
304 if err := o.opts.ReleaseHostResourcesFn()(); err != nil { 305 o.logger.Info("unable to release host resources", zap.Error(err)) 306 multiErr = multiErr.Add(err) 307 } 308 309 o.token = "" 310 o.executablePath = "" 311 o.configPath = "" 312 atomic.StoreInt32(&o.running, 0) 313 314 return multiErr.FinalError() 315 } 316 317 func (o *opAgent) Teardown(ctx context.Context, request *m3em.TeardownRequest) (*m3em.TeardownResponse, error) { 318 o.logger.Info("received Teardown()") 319 o.Lock() 320 defer o.Unlock() 321 322 if err := o.resetWithLock(reasonTeardownHeartbeat); err != nil { 323 return nil, grpc.Errorf(codes.Internal, "unable to teardown: %v", err) 324 } 325 326 return &m3em.TeardownResponse{}, nil 327 } 328 329 func (o *opAgent) isSetup() bool { 330 o.RLock() 331 defer o.RUnlock() 332 return o.isSetupWithLock() 333 } 334 335 func (o *opAgent) isSetupWithLock() bool { 336 return o.token != "" 337 } 338 339 func (o *opAgent) Setup(ctx context.Context, request *m3em.SetupRequest) (*m3em.SetupResponse, error) { 340 o.logger.Info("received Setup()") 341 342 // nil check 343 if request == nil || request.SessionToken == "" { 344 return nil, grpc.Errorf(codes.InvalidArgument, "nil request") 345 } 346 347 o.Lock() 348 defer o.Unlock() 349 350 if o.token != "" && o.token != request.SessionToken && !request.Force { 351 return nil, grpc.Errorf(codes.AlreadyExists, "agent already initialized with token: %s", o.token) 352 } 353 354 if o.isSetupWithLock() { 355 // reset agent 356 msg := fmt.Sprintf("heartbeating being overwritten by new setup request: %+v", *request) 357 if err := o.resetWithLock(msg); err != nil { 358 return nil, grpc.Errorf(codes.Aborted, "unable to reset: %v", err) 359 } 360 } 361 362 // remove any files stored in the working directory 363 wd := o.opts.WorkingDirectory() 364 o.logger.Info("removing contents from working directory", zap.String("dir", wd)) 365 if err := fs.RemoveContents(wd); err != nil { 366 return nil, grpc.Errorf(codes.Internal, 
"unable to clear working directory: %v", err) 367 } 368 369 // initialize any resources needed on the host 370 o.logger.Info("initializing host resources") 371 if err := o.opts.InitHostResourcesFn()(); err != nil { 372 o.resetWithLock(reasonSetupInitializeHostResources) // release any resources 373 return nil, grpc.Errorf(codes.Internal, "unable to initialize host resources: %v", err) 374 } 375 376 // setup new heartbeating 377 if request.HeartbeatEnabled { 378 opts := heartbeatOpts{ 379 operatorUUID: request.OperatorUuid, 380 endpoint: request.HeartbeatEndpoint, 381 nowFn: o.opts.NowFn(), 382 timeout: o.opts.HeartbeatTimeout(), 383 timeoutFn: o.heartbeatingTimeout, 384 errorFn: o.heartbeatInternalError, 385 } 386 beater, err := newHeartbeater(o, opts, o.opts.InstrumentOptions()) 387 if err != nil { 388 o.resetWithLock(reasonSetupInitializeHostResources) // release any resources 389 return nil, grpc.Errorf(codes.Aborted, "unable to start heartbeating process: %v", err) 390 } 391 o.heartbeater = beater 392 o.heartbeater.start(time.Second * time.Duration(request.HeartbeatFrequencySecs)) 393 } 394 395 o.token = request.SessionToken 396 return &m3em.SetupResponse{}, nil 397 } 398 399 func (o *opAgent) heartbeatingTimeout(lastHb time.Time) { 400 o.logger.Warn("heartbeat sending timed out, resetting agent") 401 o.Lock() 402 err := o.resetWithLock("") // "" indicates we don't want to send a heartbeat 403 o.Unlock() 404 if err == nil { 405 o.logger.Info("successfully reset agent") 406 } else { 407 o.logger.Warn("error while resetting agent", zap.Error(err)) 408 } 409 } 410 411 func (o *opAgent) heartbeatInternalError(err error) { 412 o.logger.Warn("received unknown error whilst heartbeat", zap.Error(err)) 413 o.logger.Warn("resetting agent") 414 o.Lock() 415 err = o.resetWithLock(err.Error()) 416 o.Unlock() 417 if err == nil { 418 o.logger.Info("successfully reset agent") 419 } else { 420 o.logger.Warn("error while resetting agent", zap.Error(err)) 421 } 422 } 423 424 func 
(o *opAgent) pathsRelativeToWorkingDir( 425 targets []string, 426 ) ([]string, error) { 427 files := make([]string, 0, len(targets)) 428 for _, t := range targets { 429 if strings.Contains(t, "..") { // i.e. relative path 430 return nil, fmt.Errorf("relative paths not allowed: %v", t) 431 } 432 f := path.Join(o.opts.WorkingDirectory(), t) 433 files = append(files, f) 434 } 435 return files, nil 436 } 437 438 func (o *opAgent) initFile( 439 fileType m3em.PushFileType, 440 targets []string, 441 overwrite bool, 442 ) (*multiWriter, error) { 443 if len(targets) < 1 { 444 return nil, errNoValidTargetsSpecified 445 } 446 447 if len(targets) > 1 && fileType != m3em.PushFileType_PUSH_FILE_TYPE_DATA_FILE { 448 return nil, errOnlyDataFileMultiTarget 449 } 450 451 paths, err := o.pathsRelativeToWorkingDir(targets) 452 if err != nil { 453 return nil, err 454 } 455 456 flags := os.O_CREATE | os.O_WRONLY 457 if overwrite { 458 flags = flags | os.O_TRUNC 459 } 460 461 fileMode := o.opts.NewFileMode() 462 if fileType == m3em.PushFileType_PUSH_FILE_TYPE_SERVICE_BINARY { 463 fileMode = os.FileMode(0755) 464 } 465 466 dirMode := o.opts.NewDirectoryMode() 467 return newMultiWriter(paths, flags, fileMode, dirMode) 468 } 469 470 func (o *opAgent) markFileDone( 471 fileType m3em.PushFileType, 472 mw *multiWriter, 473 ) error { 474 if len(mw.fds) != 1 && (fileType == m3em.PushFileType_PUSH_FILE_TYPE_SERVICE_BINARY || fileType == m3em.PushFileType_PUSH_FILE_TYPE_SERVICE_CONFIG) { 475 // should never happen 476 return fmt.Errorf("internal error: multiple targets for binary/config") 477 } 478 479 for _, fd := range mw.fds { 480 o.logger.Info("file transferred", 481 zap.Stringer("type", fileType), 482 zap.String("path", fd.Name())) 483 } 484 485 o.Lock() 486 defer o.Unlock() 487 488 if fileType == m3em.PushFileType_PUSH_FILE_TYPE_SERVICE_BINARY { 489 o.executablePath = mw.fds[0].Name() 490 } 491 492 if fileType == m3em.PushFileType_PUSH_FILE_TYPE_SERVICE_CONFIG { 493 o.configPath = 
mw.fds[0].Name() 494 } 495 496 return nil 497 } 498 499 // PullFile receives a file from the caller to be stored locally on the agent 500 func (o *opAgent) PushFile(stream m3em.Operator_PushFileServer) error { 501 o.logger.Info("received PushFile()") 502 var ( 503 checksum = checksum.NewAccumulator() 504 numChunks = 0 505 lastChunkIdx = int32(0) 506 fileHandle *multiWriter 507 fileType = m3em.PushFileType_PUSH_FILE_TYPE_UNKNOWN 508 err error 509 ) 510 511 for { 512 request, streamErr := stream.Recv() 513 if streamErr != nil && streamErr != io.EOF { 514 return streamErr 515 } 516 517 if request == nil { 518 break 519 } 520 521 if numChunks == 0 { 522 // first request with any data in it, log it for visibilty 523 o.logger.Info("file transfer initiated", 524 zap.Strings("targets", request.GetTargetPaths()), 525 zap.Stringer("fileType", request.GetType()), 526 zap.Bool("overwrite", request.GetOverwrite())) 527 528 fileType = request.GetType() 529 fileHandle, err = o.initFile(fileType, request.GetTargetPaths(), request.GetOverwrite()) 530 if err != nil { 531 return err 532 } 533 lastChunkIdx = request.GetData().GetIdx() - 1 534 } 535 536 chunkIdx := request.GetData().GetIdx() 537 if chunkIdx != 1+lastChunkIdx { 538 return fmt.Errorf("received chunkIdx: %d after %d", chunkIdx, lastChunkIdx) 539 } 540 lastChunkIdx = chunkIdx 541 542 numChunks++ 543 bytes := request.GetData().GetBytes() 544 checksum.Update(bytes) 545 546 numWritten, err := fileHandle.write(bytes) 547 if err != nil { 548 return err 549 } 550 551 if numWritten != len(bytes) { 552 return fmt.Errorf("unable to write bytes, expected: %d, observed: %d", len(bytes), numWritten) 553 } 554 555 if streamErr == io.EOF { 556 break 557 } 558 } 559 560 if fileHandle == nil { 561 return fmt.Errorf("multiwriter has not been initialized") 562 } 563 564 var me xerrors.MultiError 565 me = me.Add(fileHandle.Close()) 566 me = me.Add(o.markFileDone(fileType, fileHandle)) 567 if err := me.FinalError(); err != nil { 568 return 
err 569 } 570 571 return stream.SendAndClose(&m3em.PushFileResponse{ 572 FileChecksum: checksum.Current(), 573 NumChunksRecvd: int32(numChunks), 574 }) 575 } 576 577 func validatePullFileRequest(request *m3em.PullFileRequest) error { 578 if request == nil { 579 return grpc.Errorf(codes.InvalidArgument, "nil request") 580 } 581 582 if request.ChunkSize <= 0 { 583 return grpc.Errorf(codes.InvalidArgument, "chunkSize must be a positive integer") 584 } 585 586 if request.MaxSize < 0 { 587 return grpc.Errorf(codes.InvalidArgument, "maxSize must be a non-negative integer") 588 } 589 590 return nil 591 } 592 593 // PullFile sends a local agent file to the caller 594 func (o *opAgent) PullFile(request *m3em.PullFileRequest, stream m3em.Operator_PullFileServer) error { 595 if err := validatePullFileRequest(request); err != nil { 596 return err 597 } 598 o.logger.Info("received PullFile()", zap.Any("request", *request)) 599 600 o.RLock() 601 defer o.RUnlock() 602 603 if !o.isSetupWithLock() { 604 return grpc.Errorf(codes.InvalidArgument, "agent has not been setup, unable to transfer file") 605 } 606 607 pm := o.processMonitor 608 if pm == nil { 609 return grpc.Errorf(codes.InvalidArgument, "no process running, unable to transfer file") 610 } 611 612 switch fileType := request.GetFileType(); fileType { 613 case m3em.PullFileType_PULL_FILE_TYPE_SERVICE_STDERR: 614 return o.sendLocalFileWithRLock(pm.StderrPath(), request.ChunkSize, request.MaxSize, stream) 615 616 case m3em.PullFileType_PULL_FILE_TYPE_SERVICE_STDOUT: 617 return o.sendLocalFileWithRLock(pm.StdoutPath(), request.ChunkSize, request.MaxSize, stream) 618 619 default: 620 return grpc.Errorf(codes.InvalidArgument, "received unknown pull file: %v", fileType) 621 } 622 } 623 624 func (o *opAgent) sendLocalFileWithRLock(localPath string, chunkSize int64, maxBytes int64, stream m3em.Operator_PullFileServer) error { 625 fi, err := os.Stat(localPath) 626 if err != nil { 627 return grpc.Errorf(codes.InvalidArgument, "unable 
to find file: %v", err) 628 } 629 630 fd, err := os.Open(localPath) 631 if err != nil { 632 return grpc.Errorf(codes.InvalidArgument, "unable to open file: %v", err) 633 } 634 635 var ( 636 reader = bufio.NewReaderSize(fd, int(chunkSize)) 637 buf = make([]byte, chunkSize) 638 chunkIdx = 1 639 truncated = false 640 ) 641 642 // check if we need to seek ahead or if we are sending all the bytes 643 if maxBytes > 0 && fi.Size() > maxBytes { 644 offset := fi.Size() - maxBytes 645 if _, err := fd.Seek(offset, 0 /* relative to start of file */); err != nil { 646 return grpc.Errorf(codes.Internal, "unable to seek file: %v", err) 647 } 648 truncated = true 649 } 650 651 for { 652 n, err := reader.Read(buf) 653 switch err { 654 case io.EOF: 655 // i.e. streamed through the file, we can indicate we're done 656 return nil 657 658 case nil: 659 // i.e. this read succeeded, send it and continue as we can read more data 660 if streamErr := stream.Send(&m3em.PullFileResponse{ 661 Data: &m3em.DataChunk{ 662 Bytes: buf[:n], 663 Idx: int32(chunkIdx), 664 }, 665 Truncated: truncated, 666 }); streamErr != nil { 667 return grpc.Errorf(codes.Internal, "unable to send chunk: %v", streamErr.Error()) 668 } 669 670 default: 671 // i.e. something broke 672 return grpc.Errorf(codes.Unavailable, "unable to read file: %v", err.Error()) 673 } 674 675 // increment idx 676 chunkIdx++ 677 } 678 679 } 680 681 type opAgentMetrics struct { 682 // TODO(prateek): process monitor opts, metric for process uptime 683 running tally.Gauge 684 execTransferred tally.Gauge 685 confTransferred tally.Gauge 686 } 687 688 func newAgentMetrics(scope tally.Scope) *opAgentMetrics { 689 subscope := scope.SubScope("agent") 690 return &opAgentMetrics{ 691 running: subscope.Gauge("running"), 692 execTransferred: subscope.Gauge("exec_transferred"), 693 confTransferred: subscope.Gauge("conf_transferred"), 694 } 695 }