// Source: github.com/m3db/m3@v1.5.0/src/m3em/node/node.go
//
// Copyright (c) 2017 Uber Technologies, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

package node

import (
	"context"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"sync"

	"github.com/m3db/m3/src/cluster/placement"
	"github.com/m3db/m3/src/m3em/build"
	"github.com/m3db/m3/src/m3em/generated/proto/m3em"
	"github.com/m3db/m3/src/m3em/os/fs"
	xclock "github.com/m3db/m3/src/x/clock"
	xerrors "github.com/m3db/m3/src/x/errors"

	"github.com/pborman/uuid"
	"go.uber.org/zap"
	"google.golang.org/grpc"
)

// Errors returned by the svcNode lifecycle methods when the node's current
// status does not permit the requested operation.
var (
	// Returned by Setup when the node is neither uninitialized nor setup.
	errUnableToSetupInitializedNode = fmt.Errorf("unable to setup node, must be either setup/uninitialized")
	// Returned by Teardown when the node is not running, setup, or in error.
	errUnableToTeardownNode = fmt.Errorf("unable to teardown node, must be either setup/running")
	// Returned by Start when the node is not in the setup state.
	errUnableToStartNode = fmt.Errorf("unable to start node, it must be setup")
	// Returned by Stop when the node is not running.
	errUnableToStopNode = fmt.Errorf("unable to stop node, it must be running")
	// Returned by TransferLocalFile and GetRemoteOutput when the node is
	// neither setup nor running.
	errUnableToTransferFile = fmt.Errorf("unable to transfer file. node must be setup/running")
)

// svcNode is the ServiceNode implementation returned by New. It pairs a
// placement instance with the operator client used to drive the remote agent.
type svcNode struct {
	// Taken by most exported methods (Setup, Start, Stop, Teardown, Status,
	// String, TransferLocalFile, GetRemoteOutput) for their whole duration.
	sync.Mutex
	// The placement instance this node represents.
	placement.Instance
	// Logger obtained from the instrument options passed to New.
	logger *zap.Logger
	// Options supplied to New.
	opts Options
	// Lifecycle state; starts as StatusUninitialized.
	status Status
	// Build and configuration recorded by the most recent Setup call.
	currentBuild build.ServiceBuild
	currentConf  build.ServiceConfiguration
	// Connection and client produced by the configured OperatorClientFn;
	// the connection is closed (and set to nil) by Close.
	clientConn *grpc.ClientConn
	client     m3em.OperatorClient
	// Listener group created in New for this node.
	listeners *listenerGroup
	// Heartbeat server; nil when heartbeating is disabled in the options.
	heartbeater *opHeartbeatServer
	// Random UUID generated in New. It is the key under which the heartbeat
	// server is registered with the router and is sent in the setup request.
	operatorUUID string
	// Endpoint of the heartbeat router; empty when heartbeating is disabled.
	heartbeatEndpoint string
}

// New returns a new ServiceNode.
68 func New( 69 node placement.Instance, 70 opts Options, 71 ) (ServiceNode, error) { 72 if err := opts.Validate(); err != nil { 73 return nil, err 74 } 75 76 clientConn, client, err := opts.OperatorClientFn()() 77 if err != nil { 78 return nil, err 79 } 80 81 uuid := uuid.NewRandom() 82 83 var ( 84 retNode = &svcNode{ 85 logger: opts.InstrumentOptions().Logger(), 86 opts: opts, 87 Instance: node, 88 status: StatusUninitialized, 89 } 90 listeners = newListenerGroup(retNode) 91 hbUUID = uuid.String() 92 heartbeater *opHeartbeatServer 93 routerEndpoint string 94 ) 95 96 if opts.HeartbeatOptions().Enabled() { 97 router := opts.HeartbeatOptions().HeartbeatRouter() 98 routerEndpoint = router.Endpoint() 99 heartbeater = newHeartbeater(listeners, opts.HeartbeatOptions(), opts.InstrumentOptions()) 100 if err := router.Register(hbUUID, heartbeater); err != nil { 101 return nil, fmt.Errorf("unable to register heartbeat server with router: %v", err) 102 } 103 } 104 105 retNode.listeners = listeners 106 retNode.client = client 107 retNode.clientConn = clientConn 108 retNode.heartbeater = heartbeater 109 retNode.heartbeatEndpoint = routerEndpoint 110 retNode.operatorUUID = hbUUID 111 return retNode, nil 112 } 113 114 func (i *svcNode) String() string { 115 i.Lock() 116 defer i.Unlock() 117 return fmt.Sprintf("ServiceNode %s", i.Instance.String()) 118 } 119 120 func (i *svcNode) heartbeatReceived() bool { 121 return !i.heartbeater.lastHeartbeatTime().IsZero() 122 } 123 124 func (i *svcNode) Setup( 125 bld build.ServiceBuild, 126 conf build.ServiceConfiguration, 127 token string, 128 force bool, 129 ) error { 130 i.Lock() 131 defer i.Unlock() 132 if i.status != StatusUninitialized && 133 i.status != StatusSetup { 134 return errUnableToSetupInitializedNode 135 } 136 137 i.currentConf = conf 138 i.currentBuild = bld 139 140 freq := uint32(i.opts.HeartbeatOptions().Interval().Seconds()) 141 err := i.opts.Retrier().Attempt(func() error { 142 ctx := context.Background() 143 _, err := 
i.client.Setup(ctx, &m3em.SetupRequest{ 144 OperatorUuid: i.operatorUUID, 145 SessionToken: token, 146 Force: force, 147 HeartbeatEnabled: i.opts.HeartbeatOptions().Enabled(), 148 HeartbeatEndpoint: i.heartbeatEndpoint, 149 HeartbeatFrequencySecs: freq, 150 }) 151 return err 152 }) 153 154 if err != nil { 155 return fmt.Errorf("unable to setup: %v", err) 156 } 157 158 // TODO(prateek): make heartbeat pickup existing agent state 159 160 // Wait till we receive our first heartbeat 161 if i.opts.HeartbeatOptions().Enabled() { 162 i.logger.Info("waiting until initial heartbeat is received") 163 received := xclock.WaitUntil(i.heartbeatReceived, i.opts.HeartbeatOptions().Timeout()) 164 if !received { 165 return fmt.Errorf("did not receive heartbeat response from remote agent within timeout") 166 } 167 i.logger.Info("initial heartbeat received") 168 169 // start hb monitoring 170 if err := i.heartbeater.start(); err != nil { 171 return fmt.Errorf("unable to start heartbeat monitor loop: %v", err) 172 } 173 } 174 175 // transfer build 176 if err := i.opts.Retrier().Attempt(func() error { 177 iter, err := bld.Iter(i.opts.TransferBufferSize()) 178 if err != nil { 179 return err 180 } 181 return i.transferFile(transferOpts{ 182 targets: []string{bld.ID()}, 183 fileType: m3em.PushFileType_PUSH_FILE_TYPE_SERVICE_BINARY, 184 overwrite: force, 185 iter: iter, 186 }) 187 }); err != nil { 188 return fmt.Errorf("unable to transfer build: %v", err) 189 } 190 191 if err := i.opts.Retrier().Attempt(func() error { 192 iter, err := conf.Iter(i.opts.TransferBufferSize()) 193 if err != nil { 194 return err 195 } 196 return i.transferFile(transferOpts{ 197 targets: []string{conf.ID()}, 198 fileType: m3em.PushFileType_PUSH_FILE_TYPE_SERVICE_CONFIG, 199 overwrite: force, 200 iter: iter, 201 }) 202 }); err != nil { 203 return fmt.Errorf("unable to transfer config: %v", err) 204 } 205 206 i.status = StatusSetup 207 return nil 208 } 209 210 // nolint: maligned 211 type transferOpts struct { 212 
targets []string 213 fileType m3em.PushFileType 214 iter fs.FileReaderIter 215 overwrite bool 216 } 217 218 func (i *svcNode) transferFile( 219 t transferOpts, 220 ) error { 221 defer t.iter.Close() 222 ctx := context.Background() 223 stream, err := i.client.PushFile(ctx) 224 if err != nil { 225 return err 226 } 227 chunkIdx := 0 228 for ; t.iter.Next(); chunkIdx++ { 229 bytes := t.iter.Current() 230 request := &m3em.PushFileRequest{ 231 Type: t.fileType, 232 TargetPaths: t.targets, 233 Overwrite: t.overwrite, 234 Data: &m3em.DataChunk{ 235 Bytes: bytes, 236 Idx: int32(chunkIdx), 237 }, 238 } 239 err := stream.Send(request) 240 if err != nil { 241 stream.CloseSend() 242 return err 243 } 244 } 245 if err := t.iter.Err(); err != nil { 246 stream.CloseSend() 247 return err 248 } 249 250 response, err := stream.CloseAndRecv() 251 if err != nil { 252 return err 253 } 254 255 if int(response.NumChunksRecvd) != chunkIdx { 256 return fmt.Errorf("sent %d chunks, server only received %d of them", chunkIdx, response.NumChunksRecvd) 257 } 258 259 if t.iter.Checksum() != response.FileChecksum { 260 return fmt.Errorf("expected file checksum: %d, received: %d", t.iter.Checksum(), response.FileChecksum) 261 } 262 263 return nil 264 } 265 266 func (i *svcNode) TransferLocalFile( 267 srcPath string, 268 destPaths []string, 269 overwrite bool, 270 ) error { 271 i.Lock() 272 defer i.Unlock() 273 274 if i.status != StatusSetup && i.status != StatusRunning { 275 return errUnableToTransferFile 276 } 277 278 if err := i.opts.Retrier().Attempt(func() error { 279 iter, err := fs.NewSizedFileReaderIter(srcPath, i.opts.TransferBufferSize()) 280 if err != nil { 281 return err 282 } 283 return i.transferFile(transferOpts{ 284 targets: destPaths, 285 fileType: m3em.PushFileType_PUSH_FILE_TYPE_DATA_FILE, 286 overwrite: overwrite, 287 iter: iter, 288 }) 289 }); err != nil { 290 return fmt.Errorf("unable to transfer file: %v", err) 291 } 292 293 return nil 294 } 295 296 func (i *svcNode) 
pullRemoteFile(t m3em.PullFileType, fd *os.File) (bool, error) { 297 ctx := context.Background() 298 299 // resetting file in case this a retry 300 if err := fd.Truncate(0); err != nil { 301 return false, err 302 } 303 304 // create streaming client 305 client, err := i.client.PullFile(ctx, &m3em.PullFileRequest{ 306 ChunkSize: int64(i.opts.TransferBufferSize()), 307 MaxSize: i.opts.MaxPullSize(), 308 FileType: t, 309 }) 310 if err != nil { 311 return false, err 312 } 313 314 // iterate through responses 315 truncated := false 316 for { 317 response, err := client.Recv() 318 switch err { 319 case nil: // this Recv was successful, and we have more to read 320 truncated = response.Truncated 321 if _, writeErr := fd.Write(response.Data.Bytes); writeErr != nil { 322 return truncated, writeErr 323 } 324 325 case io.EOF: // no more to read, indicate success 326 return truncated, nil 327 328 default: // unexpected error, indicate failure 329 return truncated, err 330 } 331 } 332 } 333 334 func toM3EMPullType(t RemoteOutputType) (m3em.PullFileType, error) { 335 switch t { 336 case RemoteProcessStderr: 337 return m3em.PullFileType_PULL_FILE_TYPE_SERVICE_STDERR, nil 338 339 case RemoteProcessStdout: 340 return m3em.PullFileType_PULL_FILE_TYPE_SERVICE_STDOUT, nil 341 342 default: 343 return m3em.PullFileType_PULL_FILE_TYPE_UNKNOWN, fmt.Errorf("unknown output type: %v", t) 344 } 345 } 346 347 func (i *svcNode) GetRemoteOutput( 348 t RemoteOutputType, 349 localDest string, 350 ) (bool, error) { 351 i.Lock() 352 defer i.Unlock() 353 354 if i.status != StatusSetup && i.status != StatusRunning { 355 return false, errUnableToTransferFile 356 } 357 358 mType, err := toM3EMPullType(t) 359 if err != nil { 360 return false, err 361 } 362 363 // create base directory for specified remote path if it doesn't exist 364 base := filepath.Dir(localDest) 365 if err := os.MkdirAll(base, os.FileMode(0755)|os.ModeDir); err != nil { 366 return false, err 367 } 368 369 fd, err := 
os.OpenFile(localDest, os.O_CREATE|os.O_WRONLY, os.FileMode(0666)) 370 if err != nil { 371 return false, err 372 } 373 374 truncated := false 375 if retryErr := i.opts.Retrier().Attempt(func() error { 376 truncated, err = i.pullRemoteFile(mType, fd) 377 return err 378 }); retryErr != nil { 379 return truncated, fmt.Errorf("unable to get remote output: %v", retryErr) 380 } 381 382 return truncated, fd.Close() 383 } 384 385 func (i *svcNode) Teardown() error { 386 i.Lock() 387 defer i.Unlock() 388 if status := i.status; status != StatusRunning && 389 status != StatusSetup && 390 status != StatusError { 391 return errUnableToTeardownNode 392 } 393 394 // clear any listeners 395 i.listeners.clear() 396 397 if err := i.opts.Retrier().Attempt(func() error { 398 ctx := context.Background() 399 _, err := i.client.Teardown(ctx, &m3em.TeardownRequest{}) 400 return err 401 }); err != nil { 402 return err 403 } 404 405 if err := i.Close(); err != nil { 406 return err 407 } 408 409 i.status = StatusUninitialized 410 return nil 411 } 412 413 func (i *svcNode) Close() error { 414 var err xerrors.MultiError 415 416 if conn := i.clientConn; conn != nil { 417 err = err.Add(conn.Close()) 418 i.clientConn = nil 419 } 420 421 if hbServer := i.heartbeater; hbServer != nil { 422 hbServer.stop() 423 err = err.Add(i.opts.HeartbeatOptions().HeartbeatRouter().Deregister(i.operatorUUID)) 424 i.heartbeater = nil 425 i.operatorUUID = "" 426 } 427 428 return err.FinalError() 429 } 430 431 func (i *svcNode) Start() error { 432 i.Lock() 433 defer i.Unlock() 434 if i.status != StatusSetup { 435 return errUnableToStartNode 436 } 437 438 if err := i.opts.Retrier().Attempt(func() error { 439 ctx := context.Background() 440 _, err := i.client.Start(ctx, &m3em.StartRequest{}) 441 return err 442 }); err != nil { 443 return err 444 } 445 446 i.status = StatusRunning 447 return nil 448 } 449 450 func (i *svcNode) Stop() error { 451 i.Lock() 452 defer i.Unlock() 453 if i.status != StatusRunning { 454 return 
errUnableToStopNode 455 } 456 457 if err := i.opts.Retrier().Attempt(func() error { 458 ctx := context.Background() 459 _, err := i.client.Stop(ctx, &m3em.StopRequest{}) 460 return err 461 }); err != nil { 462 return err 463 } 464 465 i.status = StatusSetup 466 return nil 467 } 468 469 func (i *svcNode) Status() Status { 470 i.Lock() 471 defer i.Unlock() 472 return i.status 473 } 474 475 func (i *svcNode) RegisterListener(l Listener) ListenerID { 476 return ListenerID(i.listeners.add(l)) 477 } 478 479 func (i *svcNode) DeregisterListener(token ListenerID) { 480 i.listeners.remove(int(token)) 481 }