github.com/lirm/aeron-go@v0.0.0-20230415210743-920325491dc4/archive/replaymerge/replaymerge.go (about) 1 // Licensed under the Apache License, Version 2.0 (the "License"); 2 // you may not use this file except in compliance with the License. 3 // You may obtain a copy of the License at 4 // 5 // http://www.apache.org/licenses/LICENSE-2.0 6 // 7 // Unless required by applicable law or agreed to in writing, software 8 // distributed under the License is distributed on an "AS IS" BASIS, 9 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 // See the License for the specific language governing permissions and 11 // limitations under the License. 12 13 package replaymerge 14 15 import ( 16 "fmt" 17 "strings" 18 "time" 19 20 "github.com/lirm/aeron-go/aeron" 21 "github.com/lirm/aeron-go/aeron/logbuffer/term" 22 "github.com/lirm/aeron-go/archive" 23 ) 24 25 const LiveAddMaxWindow = int32(32 * 1024 * 1024) 26 const ReplayRemoveThreshold = int64(0) 27 const MergeProgressTimeoutDefaultMs = int64(5 * time.Millisecond) 28 29 type State int 30 31 const ( 32 StateResolveReplayPort State = iota 33 StateGetRecordingPosition 34 StateReplay 35 StateCatchup 36 StateAttemptLiveJoin 37 StateMerged 38 StateFailed 39 StateClosed 40 ) 41 42 func (s State) String() string { 43 return [...]string{"ResolveReplayPort", "GetRecordingPosition", "Replay", "Catchup", "AttemptLiveJoin", "Merged", "Failed", "Closed"}[s] 44 } 45 46 // ReplayMerge replays a recorded stream from a starting position and merge with live stream for a full history of a stream. 47 // 48 // Once constructed either of Poll or DoWork, interleaved with consumption 49 // of the Image, should be called in a duty cycle loop until IsMerged is true. 50 // After which the ReplayMerge can be closed and continued usage can be made of the Image or its 51 // parent Subscription. If an exception occurs or progress stops, the merge will fail and 52 // HasFailed will be true. 53 // 54 // If the endpoint on the replay destination uses a port of 0, then the OS will assign a port from the ephemeral 55 // range and this will be added to the replay channel for instructing the archive. 56 // 57 // NOTE: Merging is only supported with UDP streams. 58 type ReplayMerge struct { 59 recordingId int64 60 startPosition int64 61 mergeProgressTimeoutMs int64 62 replaySessionId int64 63 activeCorrelationId int64 64 nextTargetPosition int64 65 positionOfLastProgress int64 66 timeOfLastProgressMs int64 67 isLiveAdded bool 68 isReplayActive bool 69 state State 70 image aeron.Image 71 72 archive *archive.Archive 73 subscription *aeron.Subscription 74 replayDestination string 75 liveDestination string 76 replayEndpoint string 77 replayChannelUri aeron.ChannelUri 78 } 79 80 // NewReplayMerge creates a ReplayMerge to manage the merging of a replayed stream and switching over to live stream as 81 // appropriate. 82 // 83 // Parameters: 84 // 85 // subscription to use for the replay and live stream. Must be a multi-destination subscription. 86 // archive to use for the replay. 87 // replayChannel to as a template for what the archive will use. 88 // replayDestination to send the replay to and the destination added by the Subscription. 89 // liveDestination for the live stream and the destination added by the Subscription. 90 // recordingId for the replay. 91 // startPosition for the replay. 92 // epochClock to use for progress checks. 93 // mergeProgressTimeoutMs to use for progress checks. 94 func NewReplayMerge( 95 subscription *aeron.Subscription, 96 archive *archive.Archive, 97 replayChannel string, 98 replayDestination string, 99 liveDestination string, 100 recordingId int64, 101 startPosition int64, 102 mergeProgressTimeoutMs int64) (rm *ReplayMerge, err error) { 103 if strings.HasPrefix(subscription.Channel(), aeron.IpcChannel) || 104 strings.HasPrefix(replayChannel, aeron.IpcChannel) || 105 strings.HasPrefix(replayDestination, aeron.IpcChannel) || 106 strings.HasPrefix(liveDestination, aeron.IpcChannel) { 107 err = fmt.Errorf("IPC merging is not supported") 108 return 109 } 110 111 if !strings.Contains(subscription.Channel(), "control-mode=manual") { 112 err = fmt.Errorf("Subscription URI must have 'control-mode=manual' uri=%s", subscription.Channel()) 113 return 114 } 115 116 rm = &ReplayMerge{ 117 archive: archive, 118 subscription: subscription, 119 replayDestination: replayDestination, 120 liveDestination: liveDestination, 121 recordingId: recordingId, 122 startPosition: startPosition, 123 mergeProgressTimeoutMs: mergeProgressTimeoutMs, 124 replaySessionId: aeron.NullValue, 125 activeCorrelationId: aeron.NullValue, 126 nextTargetPosition: aeron.NullValue, 127 positionOfLastProgress: aeron.NullValue, 128 } 129 130 rm.replayChannelUri, err = aeron.ParseChannelUri(replayChannel) 131 if err != nil { 132 err = fmt.Errorf("Invalid replay channel '%s'", replayChannel) 133 return 134 } 135 136 rm.replayChannelUri.Set(aeron.LingerParamName, "0") 137 rm.replayChannelUri.Set(aeron.EosParamName, "false") 138 139 var replayDestinationUri aeron.ChannelUri 140 replayDestinationUri, err = aeron.ParseChannelUri(replayDestination) 141 if err != nil { 142 err = fmt.Errorf("Invalid replay destination '%s'", replayDestination) 143 return 144 } 145 rm.replayEndpoint = replayDestinationUri.Get(aeron.EndpointParamName) 146 if strings.HasSuffix(rm.replayEndpoint, ":0") { 147 rm.state = StateResolveReplayPort 148 } else { 149 rm.replayChannelUri.Set(aeron.EndpointParamName, rm.replayEndpoint) 150 rm.state = StateGetRecordingPosition 151 } 152 153 subscription.AddDestination(replayDestination) 154 rm.timeOfLastProgressMs = time.Now().UnixMilli() 155 return 156 } 157 158 // Close closes and stops any active replay. Will remove the replay destination from the subscription. 159 // This operation Will NOT remove the live destination if it has been added, so it can be used for live consumption. 160 func (rm *ReplayMerge) Close() { 161 state := rm.state 162 if StateClosed != state { 163 if !rm.archive.Aeron().IsClosed() { 164 if StateMerged != state { 165 rm.subscription.RemoveDestination(rm.replayDestination) 166 } 167 168 if rm.isReplayActive && rm.archive.Proxy.Publication.IsConnected() { 169 rm.stopReplay() 170 } 171 } 172 173 rm.setState(StateClosed) 174 } 175 } 176 177 // Subscription returns the Subscription used to consume the replayed and merged stream. 178 func (rm *ReplayMerge) Subscription() *aeron.Subscription { 179 return rm.subscription 180 } 181 182 // DoWork performs the work of replaying and merging. Should only be used if polling the underlying Image directly. 183 // Returns indication of work done processing the merge. 184 func (rm *ReplayMerge) DoWork() (workCount int, err error) { 185 nowMs := time.Now().UnixMilli() 186 187 switch rm.state { 188 case StateResolveReplayPort: 189 workCount, err = rm.resolveReplayPort(nowMs) 190 if err != nil { 191 rm.setState(StateFailed) 192 return 193 } 194 if err = rm.checkProgress(nowMs); err != nil { 195 rm.setState(StateFailed) 196 return 197 } 198 case StateGetRecordingPosition: 199 workCount, err = rm.getRecordingPosition(nowMs) 200 if err != nil { 201 rm.setState(StateFailed) 202 return 203 } 204 if err = rm.checkProgress(nowMs); err != nil { 205 rm.setState(StateFailed) 206 return 207 } 208 case StateReplay: 209 workCount, err = rm.replay(nowMs) 210 if err != nil { 211 rm.setState(StateFailed) 212 return 213 } 214 if err = rm.checkProgress(nowMs); err != nil { 215 rm.setState(StateFailed) 216 return 217 } 218 case StateCatchup: 219 workCount, err = rm.catchup(nowMs) 220 if err != nil { 221 rm.setState(StateFailed) 222 return 223 } 224 if err = rm.checkProgress(nowMs); err != nil { 225 rm.setState(StateFailed) 226 return 227 } 228 case StateAttemptLiveJoin: 229 workCount, err = rm.attemptLiveJoin(nowMs) 230 if err != nil { 231 rm.setState(StateFailed) 232 return 233 } 234 if err = rm.checkProgress(nowMs); err != nil { 235 rm.setState(StateFailed) 236 return 237 } 238 } 239 return 240 } 241 242 // Poll polls the Image used for replay and merging and live stream. The doWork method 243 // will be called before the poll so that processing of the merge can be done. 244 // 245 // Returns number of fragments processed. 246 func (rm *ReplayMerge) Poll(fragmentHandler term.FragmentHandler, fragmentLimit int) (workCount int, err error) { 247 workCount, err = rm.DoWork() 248 if err != nil { 249 return 250 } 251 if rm.image == nil { 252 return 253 } 254 return rm.image.Poll(fragmentHandler, fragmentLimit), nil 255 } 256 257 // IsMerged returns if the live stream merged and the replay stopped? 258 func (rm *ReplayMerge) IsMerged() bool { 259 return rm.state == StateMerged 260 } 261 262 // HasFailed returns if the replay merge failed due to an error? 263 func (rm *ReplayMerge) HasFailed() bool { 264 return rm.state == StateFailed 265 } 266 267 // Image returns the image which is a merge of the replay and live stream. 268 func (rm *ReplayMerge) Image() aeron.Image { 269 return rm.image 270 } 271 272 // IsLiveAdded returns if the live destination added to the subscription. 273 func (rm *ReplayMerge) IsLiveAdded() bool { 274 return rm.isLiveAdded 275 } 276 277 func (rm *ReplayMerge) resolveReplayPort(nowMs int64) (workCount int, err error) { 278 resolvedEndpoint := rm.subscription.ResolvedEndpoint() 279 if resolvedEndpoint != "" { 280 i := strings.LastIndex(resolvedEndpoint, ":") 281 rm.replayChannelUri.Set(aeron.EndpointParamName, 282 rm.replayEndpoint[0:len(rm.replayEndpoint)-2]+resolvedEndpoint[i:]) 283 284 rm.timeOfLastProgressMs = nowMs 285 rm.setState(StateGetRecordingPosition) 286 workCount += 1 287 } 288 289 return 290 } 291 292 func (rm *ReplayMerge) getRecordingPosition(nowMs int64) (workCount int, err error) { 293 if aeron.NullValue == rm.activeCorrelationId { 294 correlationId := rm.archive.Aeron().NextCorrelationID() 295 296 if rm.archive.Proxy.RecordingPositionRequest(correlationId, rm.recordingId) == nil { 297 rm.activeCorrelationId = correlationId 298 rm.timeOfLastProgressMs = nowMs 299 workCount += 1 300 } 301 return 302 } 303 304 var success bool 305 success, err = rm.pollForResponse() 306 if err != nil { 307 return 308 } 309 if success { 310 rm.nextTargetPosition = rm.polledRelevantId() 311 rm.activeCorrelationId = aeron.NullValue 312 313 if archive.RecordingPositionNull == rm.nextTargetPosition { 314 correlationId := rm.archive.Aeron().NextCorrelationID() 315 316 if rm.archive.Proxy.StopPositionRequest(correlationId, rm.recordingId) == nil { 317 rm.activeCorrelationId = correlationId 318 rm.timeOfLastProgressMs = nowMs 319 workCount += 1 320 } 321 } else { 322 rm.timeOfLastProgressMs = nowMs 323 rm.setState(StateReplay) 324 } 325 } 326 327 workCount += 1 328 329 return 330 } 331 332 func (rm *ReplayMerge) replay(nowMs int64) (workCount int, err error) { 333 if aeron.NullValue == rm.activeCorrelationId { 334 correlationId := rm.archive.Aeron().NextCorrelationID() 335 if rm.archive.Proxy.ReplayRequest( 336 correlationId, 337 rm.recordingId, 338 rm.startPosition, 339 archive.RecordingLengthMax, 340 rm.replayChannelUri.String(), 341 rm.subscription.StreamID()) == nil { 342 rm.activeCorrelationId = correlationId 343 rm.timeOfLastProgressMs = nowMs 344 workCount += 1 345 } 346 return 347 } 348 349 var success bool 350 success, err = rm.pollForResponse() 351 if err != nil { 352 return 353 } 354 if success { 355 rm.isReplayActive = true 356 rm.replaySessionId = rm.polledRelevantId() 357 rm.timeOfLastProgressMs = nowMs 358 rm.setState(StateCatchup) 359 workCount += 1 360 } 361 return 362 } 363 364 func (rm *ReplayMerge) catchup(nowMs int64) (workCount int, err error) { 365 366 if rm.image == nil && rm.subscription.IsConnected() { 367 rm.timeOfLastProgressMs = nowMs 368 rm.image = rm.subscription.ImageBySessionID(int32(rm.replaySessionId)) 369 rm.positionOfLastProgress = aeron.NullValue 370 if rm.image != nil { 371 rm.positionOfLastProgress = rm.image.Position() 372 } 373 } 374 375 if rm.image != nil { 376 position := rm.image.Position() 377 if position >= rm.nextTargetPosition { 378 rm.timeOfLastProgressMs = nowMs 379 rm.positionOfLastProgress = position 380 rm.setState(StateAttemptLiveJoin) 381 workCount += 1 382 } else if position > rm.positionOfLastProgress { 383 rm.timeOfLastProgressMs = nowMs 384 rm.positionOfLastProgress = position 385 } else if rm.image.IsClosed() { 386 err = fmt.Errorf("ReplayMerge Image closed unexpectedly.") 387 return 388 } 389 } 390 return 391 } 392 393 func (rm *ReplayMerge) attemptLiveJoin(nowMs int64) (workCount int, err error) { 394 395 if aeron.NullValue == rm.activeCorrelationId { 396 correlationId := rm.archive.Aeron().NextCorrelationID() 397 if rm.archive.Proxy.RecordingPositionRequest(correlationId, rm.recordingId) == nil { 398 rm.activeCorrelationId = correlationId 399 workCount += 1 400 } 401 return 402 } 403 404 var success bool 405 success, err = rm.pollForResponse() 406 if err != nil { 407 return 408 } 409 if success { 410 rm.nextTargetPosition = rm.polledRelevantId() 411 rm.activeCorrelationId = aeron.NullValue 412 413 if archive.RecordingPositionNull == rm.nextTargetPosition { 414 correlationId := rm.archive.Aeron().NextCorrelationID() 415 if rm.archive.Proxy.RecordingPositionRequest(correlationId, rm.recordingId) == nil { 416 rm.activeCorrelationId = correlationId 417 } 418 } else { 419 nextState := StateCatchup 420 421 if rm.image != nil { 422 position := rm.image.Position() 423 424 if rm.shouldAddLiveDestination(position) { 425 rm.subscription.AddDestination(rm.liveDestination) 426 rm.timeOfLastProgressMs = nowMs 427 rm.positionOfLastProgress = position 428 rm.isLiveAdded = true 429 } else if rm.shouldStopAndRemoveReplay(position) { 430 rm.subscription.RemoveDestination(rm.replayDestination) 431 rm.stopReplay() 432 rm.timeOfLastProgressMs = nowMs 433 rm.positionOfLastProgress = position 434 nextState = StateMerged 435 } 436 } 437 438 rm.setState(nextState) 439 } 440 441 workCount += 1 442 } 443 444 return 445 } 446 447 func (rm *ReplayMerge) stopReplay() { 448 correlationId := rm.archive.Aeron().NextCorrelationID() 449 if rm.archive.Proxy.StopReplayRequest(correlationId, rm.replaySessionId) == nil { 450 rm.isReplayActive = false 451 } 452 } 453 454 func (rm *ReplayMerge) setState(newState State) { 455 rm.state = newState 456 rm.activeCorrelationId = aeron.NullValue 457 } 458 459 func (rm *ReplayMerge) shouldAddLiveDestination(position int64) bool { 460 mn := rm.image.TermBufferLength() >> 2 461 if mn > LiveAddMaxWindow { 462 mn = LiveAddMaxWindow 463 } 464 return !rm.isLiveAdded && 465 (rm.nextTargetPosition-position) <= int64(mn) 466 } 467 468 func (rm *ReplayMerge) shouldStopAndRemoveReplay(position int64) bool { 469 return rm.isLiveAdded && 470 (rm.nextTargetPosition-position) <= ReplayRemoveThreshold && 471 rm.image.ActiveTransportCount() >= 2 472 } 473 474 func (rm *ReplayMerge) checkProgress(nowMs int64) error { 475 if nowMs > (rm.timeOfLastProgressMs + rm.mergeProgressTimeoutMs) { 476 return fmt.Errorf("ReplayMerge no progress: state=%s", rm.state) 477 } 478 return nil 479 } 480 481 // Returns whether this succeeded, and what the error is. 482 func (rm *ReplayMerge) pollForResponse() (bool, error) { 483 correlationId := rm.activeCorrelationId 484 poller := rm.archive.Control 485 486 if poller.Poll() > 0 && poller.Results.IsPollComplete { 487 if poller.Results.ControlResponse.ControlSessionId == rm.archive.SessionID { 488 if poller.Results.ErrorResponse != nil { 489 err := fmt.Errorf( 490 "archive response for correlationId=%d, error=%s", 491 correlationId, 492 poller.Results.ErrorResponse, 493 ) 494 return false, err 495 } 496 } 497 return poller.Results.CorrelationId == correlationId, nil 498 } 499 // TODO: (false, nil) is what was here before, but I suspect that (true, nil) is more accurate? 500 // Check this when revamping archive code. 501 return false, nil 502 } 503 504 func (rm *ReplayMerge) polledRelevantId() int64 { 505 poller := rm.archive.Control 506 return poller.Results.ControlResponse.RelevantId 507 }