go.uber.org/cadence@v1.2.9/internal/session.go (about) 1 // Copyright (c) 2017 Uber Technologies, Inc. 2 // 3 // Permission is hereby granted, free of charge, to any person obtaining a copy 4 // of this software and associated documentation files (the "Software"), to deal 5 // in the Software without restriction, including without limitation the rights 6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 // copies of the Software, and to permit persons to whom the Software is 8 // furnished to do so, subject to the following conditions: 9 // 10 // The above copyright notice and this permission notice shall be included in 11 // all copies or substantial portions of the Software. 12 // 13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 // THE SOFTWARE. 20 21 package internal 22 23 import ( 24 "context" 25 "errors" 26 "fmt" 27 "sync" 28 "time" 29 30 "github.com/pborman/uuid" 31 "go.uber.org/zap" 32 33 "go.uber.org/cadence/internal/common/backoff" 34 ) 35 36 type ( 37 // SessionInfo contains information of a created session. For now, exported 38 // fields are SessionID and HostName. 39 // SessionID is a uuid generated when CreateSession() or RecreateSession() 40 // is called and can be used to uniquely identify a session. 41 // HostName specifies which host is executing the session 42 SessionInfo struct { 43 SessionID string 44 HostName string 45 resourceID string // hide from user for now 46 tasklist string // resource specific tasklist 47 sessionState sessionState 48 sessionCancelFunc CancelFunc // cancel func for the session context, used by both creation activity and user activities 49 completionCtx Context // context for executing the completion activity 50 } 51 52 // SessionOptions specifies metadata for a session. 53 // ExecutionTimeout: required, no default 54 // Specifies the maximum amount of time the session can run 55 // CreationTimeout: required, no default 56 // Specifies how long session creation can take before returning an error 57 // HeartbeatTimeout: optional, default 20s 58 // Specifies the heartbeat timeout. If heartbeat is not received by server 59 // within the timeout, the session will be declared as failed 60 SessionOptions struct { 61 ExecutionTimeout time.Duration 62 CreationTimeout time.Duration 63 HeartbeatTimeout time.Duration 64 } 65 66 recreateSessionParams struct { 67 Tasklist string 68 } 69 70 sessionState int 71 72 sessionTokenBucket struct { 73 *sync.Cond 74 availableToken int 75 } 76 77 sessionEnvironment interface { 78 CreateSession(ctx context.Context, sessionID string) (<-chan struct{}, error) 79 CompleteSession(sessionID string) 80 AddSessionToken() 81 SignalCreationResponse(ctx context.Context, sessionID string) error 82 GetResourceSpecificTasklist() string 83 GetTokenBucket() *sessionTokenBucket 84 } 85 86 sessionEnvironmentImpl struct { 87 *sync.Mutex 88 doneChanMap map[string]chan struct{} 89 resourceID string 90 resourceSpecificTasklist string 91 sessionTokenBucket *sessionTokenBucket 92 } 93 94 sessionCreationResponse struct { 95 Tasklist string 96 HostName string 97 ResourceID string 98 } 99 ) 100 101 // Session State enum 102 const ( 103 sessionStateOpen sessionState = iota 104 sessionStateFailed 105 sessionStateClosed 106 ) 107 108 const ( 109 sessionInfoContextKey contextKey = "sessionInfo" 110 sessionEnvironmentContextKey contextKey = "sessionEnvironment" 111 112 sessionCreationActivityName string = "internalSessionCreationActivity" 113 sessionCompletionActivityName string = "internalSessionCompletionActivity" 114 115 errTooManySessionsMsg string = "too many outstanding sessions" 116 117 defaultSessionHeartbeatTimeout time.Duration = time.Second * 20 118 maxSessionHeartbeatInterval time.Duration = time.Second * 10 119 ) 120 121 var ( 122 // ErrSessionFailed is the error returned when user tries to execute an activity but the 123 // session it belongs to has already failed 124 ErrSessionFailed = errors.New("session has failed") 125 errFoundExistingOpenSession = errors.New("found exisiting open session in the context") 126 ) 127 128 // Note: Worker should be configured to process session. To do this, set the following 129 // fields in WorkerOptions: 130 // EnableSessionWorker: true 131 // SessionResourceID: The identifier of the resource consumed by sessions. 132 // It's the user's responsibility to ensure there's only one worker using this resourceID. 133 // This option is not available for now as automatic session reestablishing is not implemented. 134 // MaxConcurrentSessionExecutionSize: the maximum number of concurrently sessions the resource 135 // support. By default, 1000 is used. 136 137 // CreateSession creates a session and returns a new context which contains information 138 // of the created session. The session will be created on the tasklist user specified in 139 // ActivityOptions. If none is specified, the default one will be used. 140 // 141 // CreationSession will fail in the following situations: 142 // 1. The context passed in already contains a session which is still open 143 // (not closed and failed). 144 // 2. All the workers are busy (number of sessions currently running on all the workers have reached 145 // MaxConcurrentSessionExecutionSize, which is specified when starting the workers) and session 146 // cannot be created within a specified timeout. 147 // 148 // If an activity is executed using the returned context, it's regarded as part of the 149 // session. All activities within the same session will be executed by the same worker. 150 // User still needs to handle the error returned when executing an activity. Session will 151 // not be marked as failed if an activity within it returns an error. Only when the worker 152 // executing the session is down, that session will be marked as failed. Executing an activity 153 // within a failed session will return ErrSessionFailed immediately without scheduling that activity. 154 // 155 // The returned session Context will be cancelled if the session fails (worker died) or CompleteSession() 156 // is called. This means that in these two cases, all user activities scheduled using the returned session 157 // Context will also be cancelled. 158 // 159 // If user wants to end a session since activity returns some error, use CompleteSession API below. 160 // New session can be created if necessary to retry the whole session. 161 // 162 // Example: 163 // 164 // so := &SessionOptions{ 165 // ExecutionTimeout: time.Minute, 166 // CreationTimeout: time.Minute, 167 // } 168 // sessionCtx, err := CreateSession(ctx, so) 169 // if err != nil { 170 // // Creation failed. Wrong ctx or too many outstanding sessions. 171 // } 172 // defer CompleteSession(sessionCtx) 173 // err = ExecuteActivity(sessionCtx, someActivityFunc, activityInput).Get(sessionCtx, nil) 174 // if err == ErrSessionFailed { 175 // // Session has failed 176 // } else { 177 // // Handle activity error 178 // } 179 // ... // execute more activities using sessionCtx 180 func CreateSession(ctx Context, sessionOptions *SessionOptions) (Context, error) { 181 options := getActivityOptions(ctx) 182 baseTasklist := options.TaskListName 183 if baseTasklist == "" { 184 baseTasklist = options.OriginalTaskListName 185 } 186 return createSession(ctx, getCreationTasklist(baseTasklist), sessionOptions, true) 187 } 188 189 // RecreateSession recreate a session based on the sessionInfo passed in. Activities executed within 190 // the recreated session will be executed by the same worker as the previous session. RecreateSession() 191 // returns an error under the same situation as CreateSession() or the token passed in is invalid. 192 // It also has the same usage as CreateSession(). 193 // 194 // The main usage of RecreateSession is for long sessions that are splited into multiple runs. At the end of 195 // one run, complete the current session, get recreateToken from sessionInfo by calling SessionInfo.GetRecreateToken() 196 // and pass the token to the next run. In the new run, session can be recreated using that token. 197 func RecreateSession(ctx Context, recreateToken []byte, sessionOptions *SessionOptions) (Context, error) { 198 recreateParams, err := deserializeRecreateToken(recreateToken) 199 if err != nil { 200 return nil, fmt.Errorf("failed to deserilalize recreate token: %v", err) 201 } 202 return createSession(ctx, recreateParams.Tasklist, sessionOptions, true) 203 } 204 205 // CompleteSession completes a session. It releases worker resources, so other sessions can be created. 206 // CompleteSession won't do anything if the context passed in doesn't contain any session information or the 207 // session has already completed or failed. 208 // 209 // After a session has completed, user can continue to use the context, but the activities will be scheduled 210 // on the normal taskList (as user specified in ActivityOptions) and may be picked up by another worker since 211 // it's not in a session. 212 func CompleteSession(ctx Context) { 213 sessionInfo := getSessionInfo(ctx) 214 if sessionInfo == nil || sessionInfo.sessionState != sessionStateOpen { 215 return 216 } 217 218 // first cancel both the creation activity and all user activities 219 // this will cancel the ctx passed into this function 220 sessionInfo.sessionCancelFunc() 221 222 // then execute then completion activity using the completionCtx, which is not cancelled. 223 completionCtx := WithActivityOptions(sessionInfo.completionCtx, ActivityOptions{ 224 ScheduleToStartTimeout: time.Second * 3, 225 StartToCloseTimeout: time.Second * 3, 226 }) 227 228 // even though the creation activity has been cancelled, the session worker doesn't know. The worker will wait until 229 // next heartbeat to figure out that the workflow is completed and then release the resource. We need to make sure the 230 // completion activity is executed before the workflow exits. 231 // the tasklist will be overrided to use the one stored in sessionInfo. 232 err := ExecuteActivity(completionCtx, sessionCompletionActivityName, sessionInfo.SessionID).Get(completionCtx, nil) 233 if err != nil { 234 GetLogger(completionCtx).Warn("Complete session activity failed", zap.Error(err)) 235 } 236 237 sessionInfo.sessionState = sessionStateClosed 238 getWorkflowEnvironment(ctx).RemoveSession(sessionInfo.SessionID) 239 GetLogger(ctx).Debug("Completed session", zap.String("sessionID", sessionInfo.SessionID)) 240 } 241 242 // GetSessionInfo returns the sessionInfo stored in the context. If there are multiple sessions in the context, 243 // (for example, the same context is used to create, complete, create another session. Then user found that the 244 // session has failed, and created a new one on it), the most recent sessionInfo will be returned. 245 // 246 // This API will return nil if there's no sessionInfo in the context. 247 func GetSessionInfo(ctx Context) *SessionInfo { 248 info := getSessionInfo(ctx) 249 if info == nil { 250 GetLogger(ctx).Warn("Context contains no session information") 251 } 252 return info 253 } 254 255 // GetRecreateToken returns the token needed to recreate a session. The returned value should be passed to 256 // RecreateSession() API. 257 func (s *SessionInfo) GetRecreateToken() []byte { 258 params := recreateSessionParams{ 259 Tasklist: s.tasklist, 260 } 261 return mustSerializeRecreateToken(¶ms) 262 } 263 264 func getSessionInfo(ctx Context) *SessionInfo { 265 info := ctx.Value(sessionInfoContextKey) 266 if info == nil { 267 return nil 268 } 269 return info.(*SessionInfo) 270 } 271 272 func setSessionInfo(ctx Context, sessionInfo *SessionInfo) Context { 273 return WithValue(ctx, sessionInfoContextKey, sessionInfo) 274 } 275 276 func createSession(ctx Context, creationTasklist string, options *SessionOptions, retryable bool) (Context, error) { 277 logger := GetLogger(ctx) 278 logger.Debug("Start creating session") 279 if prevSessionInfo := getSessionInfo(ctx); prevSessionInfo != nil && prevSessionInfo.sessionState == sessionStateOpen { 280 return nil, errFoundExistingOpenSession 281 } 282 sessionID, err := generateSessionID(ctx) 283 if err != nil { 284 return nil, err 285 } 286 287 tasklistChan := GetSignalChannel(ctx, sessionID) // use sessionID as channel name 288 // Retry is only needed when creating new session and the error returned is NewCustomError(errTooManySessionsMsg) 289 retryPolicy := &RetryPolicy{ 290 InitialInterval: time.Second, 291 BackoffCoefficient: 1.1, 292 MaximumInterval: time.Second * 10, 293 ExpirationInterval: options.CreationTimeout, 294 NonRetriableErrorReasons: []string{ 295 "cadenceInternal:Panic", 296 "cadenceInternal:Generic", 297 "cadenceInternal:Timeout START_TO_CLOSE", 298 "cadenceInternal:Timeout HEARTBEAT", 299 }, 300 } 301 302 heartbeatTimeout := defaultSessionHeartbeatTimeout 303 if options.HeartbeatTimeout != time.Duration(0) { 304 heartbeatTimeout = options.HeartbeatTimeout 305 } 306 ao := ActivityOptions{ 307 TaskList: creationTasklist, 308 ScheduleToStartTimeout: options.CreationTimeout, 309 StartToCloseTimeout: options.ExecutionTimeout, 310 HeartbeatTimeout: heartbeatTimeout, 311 } 312 if retryable { 313 ao.RetryPolicy = retryPolicy 314 } 315 316 sessionInfo := &SessionInfo{ 317 SessionID: sessionID, 318 sessionState: sessionStateOpen, 319 } 320 completionCtx := setSessionInfo(ctx, sessionInfo) 321 sessionInfo.completionCtx = completionCtx 322 323 // create sessionCtx as a child ctx as the completionCtx for two reasons: 324 // 1. completionCtx still needs the session information 325 // 2. When completing session, we need to cancel both creation activity and all user activities, but 326 // we can't cancel the completionCtx. 327 sessionCtx, sessionCancelFunc := WithCancel(completionCtx) 328 creationCtx := WithActivityOptions(sessionCtx, ao) 329 creationFuture := ExecuteActivity(creationCtx, sessionCreationActivityName, sessionID) 330 331 var creationErr error 332 var creationResponse sessionCreationResponse 333 s := NewSelector(creationCtx) 334 s.AddReceive(tasklistChan, func(c Channel, more bool) { 335 c.Receive(creationCtx, &creationResponse) 336 }) 337 s.AddFuture(creationFuture, func(f Future) { 338 // activity stoped before signal is received, must be creation timeout. 339 creationErr = f.Get(creationCtx, nil) 340 GetLogger(creationCtx).Debug("Failed to create session", zap.String("sessionID", sessionID), zap.Error(creationErr)) 341 }) 342 s.Select(creationCtx) 343 344 if creationErr != nil { 345 sessionCancelFunc() 346 return nil, creationErr 347 } 348 349 sessionInfo.tasklist = creationResponse.Tasklist 350 sessionInfo.resourceID = creationResponse.ResourceID 351 sessionInfo.HostName = creationResponse.HostName 352 sessionInfo.sessionCancelFunc = sessionCancelFunc 353 354 Go(creationCtx, func(creationCtx Context) { 355 err := creationFuture.Get(creationCtx, nil) 356 if err == nil { 357 return 358 } 359 if _, ok := err.(*CanceledError); !ok { 360 getWorkflowEnvironment(creationCtx).RemoveSession(sessionID) 361 GetLogger(creationCtx).Debug("Session failed", zap.String("sessionID", sessionID), zap.Error(err)) 362 sessionInfo.sessionState = sessionStateFailed 363 sessionCancelFunc() 364 } 365 }) 366 367 logger.Debug("Created session", zap.String("sessionID", sessionID)) 368 getWorkflowEnvironment(ctx).AddSession(sessionInfo) 369 return sessionCtx, nil 370 } 371 372 func generateSessionID(ctx Context) (string, error) { 373 var sessionID string 374 err := SideEffect(ctx, func(ctx Context) interface{} { 375 return uuid.New() 376 }).Get(&sessionID) 377 return sessionID, err 378 } 379 380 func getCreationTasklist(base string) string { 381 return base + "__internal_session_creation" 382 } 383 384 func getResourceSpecificTasklist(resourceID string) string { 385 return resourceID + "@" + getHostName() 386 } 387 388 func sessionCreationActivity(ctx context.Context, sessionID string) error { 389 sessionEnv, ok := ctx.Value(sessionEnvironmentContextKey).(sessionEnvironment) 390 if !ok { 391 panic("no session environment in context") 392 } 393 394 doneCh, err := sessionEnv.CreateSession(ctx, sessionID) 395 if err != nil { 396 return err 397 } 398 399 defer sessionEnv.AddSessionToken() 400 401 if err := sessionEnv.SignalCreationResponse(ctx, sessionID); err != nil { 402 return err 403 } 404 405 activityEnv := getActivityEnv(ctx) 406 heartbeatInterval := activityEnv.heartbeatTimeout / 3 407 if heartbeatInterval > maxSessionHeartbeatInterval { 408 heartbeatInterval = maxSessionHeartbeatInterval 409 } 410 ticker := time.NewTicker(heartbeatInterval) 411 defer ticker.Stop() 412 413 heartbeatRetryPolicy := backoff.NewExponentialRetryPolicy(time.Second) 414 heartbeatRetryPolicy.SetMaximumInterval(time.Second * 2) 415 heartbeatRetryPolicy.SetExpirationInterval(heartbeatInterval) 416 417 for { 418 select { 419 case <-ctx.Done(): 420 sessionEnv.CompleteSession(sessionID) 421 return ctx.Err() 422 case <-ticker.C: 423 heartbeatOp := func() error { 424 // here we skip the internal heartbeat batching, as otherwise the activity has only once chance 425 // for heartbeating and if that failed, the entire session will get fail due to heartbeat timeout. 426 // since the heartbeat interval is controlled by the session framework, we don't need to worry about 427 // calling heartbeat too frequently and causing trouble for the sever. (note the min heartbeat timeout 428 // is 1 sec.) 429 return activityEnv.serviceInvoker.Heartbeat([]byte{}) 430 } 431 isRetryable := func(_ error) bool { 432 // there will be two types of error here: 433 // 1. transient errors like timeout, in which case we should not fail the session 434 // 2. non-retryable errors like activity cancelled, activity not found or domain 435 // not active. In those cases, the internal implementation will cancel the context, 436 // so in the next iteration, ctx.Done() will be selected. Here we rely on the heartbeat 437 // internal implementation to tell which error is non-retryable. 438 select { 439 case <-ctx.Done(): 440 return false 441 default: 442 return true 443 } 444 } 445 err := backoff.Retry(ctx, heartbeatOp, heartbeatRetryPolicy, isRetryable) 446 if err != nil { 447 GetActivityLogger(ctx).Info("session heartbeat failed", zap.Error(err), zap.String("sessionID", sessionID)) 448 } 449 case <-doneCh: 450 return nil 451 } 452 } 453 } 454 455 func sessionCompletionActivity(ctx context.Context, sessionID string) error { 456 sessionEnv, ok := ctx.Value(sessionEnvironmentContextKey).(sessionEnvironment) 457 if !ok { 458 panic("no session environment in context") 459 } 460 sessionEnv.CompleteSession(sessionID) 461 return nil 462 } 463 464 func isSessionCreationActivity(activity interface{}) bool { 465 activityName, ok := activity.(string) 466 return ok && activityName == sessionCreationActivityName 467 } 468 469 func mustSerializeRecreateToken(params *recreateSessionParams) []byte { 470 dc := getDefaultDataConverter() 471 token, err := dc.ToData(params) 472 if err != nil { 473 panic(err) 474 } 475 return token 476 } 477 478 func deserializeRecreateToken(token []byte) (*recreateSessionParams, error) { 479 dc := getDefaultDataConverter() 480 var recreateParams recreateSessionParams 481 err := dc.FromData(token, &recreateParams) 482 return &recreateParams, err 483 } 484 485 func newSessionTokenBucket(concurrentSessionExecutionSize int) *sessionTokenBucket { 486 return &sessionTokenBucket{ 487 Cond: sync.NewCond(&sync.Mutex{}), 488 availableToken: concurrentSessionExecutionSize, 489 } 490 } 491 492 func (t *sessionTokenBucket) waitForAvailableToken() { 493 t.L.Lock() 494 defer t.L.Unlock() 495 for t.availableToken == 0 { 496 t.Wait() 497 } 498 } 499 500 func (t *sessionTokenBucket) addToken() { 501 t.L.Lock() 502 t.availableToken++ 503 t.L.Unlock() 504 t.Signal() 505 } 506 507 func (t *sessionTokenBucket) getToken() bool { 508 t.L.Lock() 509 defer t.L.Unlock() 510 if t.availableToken == 0 { 511 return false 512 } 513 t.availableToken-- 514 return true 515 } 516 517 func newSessionEnvironment(resourceID string, concurrentSessionExecutionSize int) sessionEnvironment { 518 return &sessionEnvironmentImpl{ 519 Mutex: &sync.Mutex{}, 520 doneChanMap: make(map[string]chan struct{}), 521 resourceID: resourceID, 522 resourceSpecificTasklist: getResourceSpecificTasklist(resourceID), 523 sessionTokenBucket: newSessionTokenBucket(concurrentSessionExecutionSize), 524 } 525 } 526 527 func (env *sessionEnvironmentImpl) CreateSession(ctx context.Context, sessionID string) (<-chan struct{}, error) { 528 if !env.sessionTokenBucket.getToken() { 529 return nil, NewCustomError(errTooManySessionsMsg) 530 } 531 532 env.Lock() 533 defer env.Unlock() 534 doneCh := make(chan struct{}) 535 env.doneChanMap[sessionID] = doneCh 536 return doneCh, nil 537 } 538 539 func (env *sessionEnvironmentImpl) AddSessionToken() { 540 env.sessionTokenBucket.addToken() 541 } 542 543 func (env *sessionEnvironmentImpl) SignalCreationResponse(ctx context.Context, sessionID string) error { 544 activityEnv := getActivityEnv(ctx) 545 546 signalInput, err := encodeArg(getDefaultDataConverter(), env.getCreationResponse()) 547 if err != nil { 548 return err 549 } 550 551 return activityEnv.serviceInvoker.SignalWorkflow( 552 ctx, 553 activityEnv.workflowDomain, 554 activityEnv.workflowExecution.ID, 555 activityEnv.workflowExecution.RunID, 556 sessionID, 557 signalInput, 558 ) 559 } 560 561 func (env *sessionEnvironmentImpl) getCreationResponse() *sessionCreationResponse { 562 return &sessionCreationResponse{ 563 Tasklist: env.resourceSpecificTasklist, 564 ResourceID: env.resourceID, 565 HostName: getHostName(), 566 } 567 } 568 569 func (env *sessionEnvironmentImpl) CompleteSession(sessionID string) { 570 env.Lock() 571 defer env.Unlock() 572 573 if doneChan, ok := env.doneChanMap[sessionID]; ok { 574 delete(env.doneChanMap, sessionID) 575 close(doneChan) 576 } 577 } 578 579 func (env *sessionEnvironmentImpl) GetResourceSpecificTasklist() string { 580 return env.resourceSpecificTasklist 581 } 582 583 func (env *sessionEnvironmentImpl) GetTokenBucket() *sessionTokenBucket { 584 return env.sessionTokenBucket 585 } 586 587 // The following two implemention is for testsuite only. The only difference is that 588 // the creation activity is not long running, otherwise it will block timers from auto firing. 589 func sessionCreationActivityForTest(ctx context.Context, sessionID string) error { 590 sessionEnv := ctx.Value(sessionEnvironmentContextKey).(sessionEnvironment) 591 592 if _, err := sessionEnv.CreateSession(ctx, sessionID); err != nil { 593 return err 594 } 595 596 return sessionEnv.SignalCreationResponse(ctx, sessionID) 597 } 598 599 func sessionCompletionActivityForTest(ctx context.Context, sessionID string) error { 600 sessionEnv := ctx.Value(sessionEnvironmentContextKey).(sessionEnvironment) 601 602 sessionEnv.CompleteSession(sessionID) 603 604 // Add session token in the completion activity. 605 sessionEnv.AddSessionToken() 606 return nil 607 }