github.com/lirm/aeron-go@v0.0.0-20230415210743-920325491dc4/archive/replaymerge/replaymerge.go (about)

     1  // Licensed under the Apache License, Version 2.0 (the "License");
     2  // you may not use this file except in compliance with the License.
     3  // You may obtain a copy of the License at
     4  //
     5  // http://www.apache.org/licenses/LICENSE-2.0
     6  //
     7  // Unless required by applicable law or agreed to in writing, software
     8  // distributed under the License is distributed on an "AS IS" BASIS,
     9  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    10  // See the License for the specific language governing permissions and
    11  // limitations under the License.
    12  
    13  package replaymerge
    14  
    15  import (
    16  	"fmt"
    17  	"strings"
    18  	"time"
    19  
    20  	"github.com/lirm/aeron-go/aeron"
    21  	"github.com/lirm/aeron-go/aeron/logbuffer/term"
    22  	"github.com/lirm/aeron-go/archive"
    23  )
    24  
    25  const LiveAddMaxWindow = int32(32 * 1024 * 1024)
    26  const ReplayRemoveThreshold = int64(0)
    27  const MergeProgressTimeoutDefaultMs = int64(5 * time.Millisecond)
    28  
    29  type State int
    30  
    31  const (
    32  	StateResolveReplayPort State = iota
    33  	StateGetRecordingPosition
    34  	StateReplay
    35  	StateCatchup
    36  	StateAttemptLiveJoin
    37  	StateMerged
    38  	StateFailed
    39  	StateClosed
    40  )
    41  
    42  func (s State) String() string {
    43  	return [...]string{"ResolveReplayPort", "GetRecordingPosition", "Replay", "Catchup", "AttemptLiveJoin", "Merged", "Failed", "Closed"}[s]
    44  }
    45  
// ReplayMerge replays a recorded stream from a starting position and merges it with the live stream to give a
// full history of a stream.
//
// Once constructed, either Poll or DoWork, interleaved with consumption
// of the Image, should be called in a duty cycle loop until IsMerged is true.
// After that the ReplayMerge can be closed and the Image or its parent Subscription
// can continue to be used. If an error occurs or progress stops, the merge will fail and
// HasFailed will be true.
//
// If the endpoint on the replay destination uses a port of 0, then the OS will assign a port from the ephemeral
// range and this will be added to the replay channel for instructing the archive.
//
// NOTE: Merging is only supported with UDP streams.
type ReplayMerge struct {
	// recordingId is the recording passed to the archive position and replay requests.
	recordingId            int64
	// startPosition is the position passed to the replay request.
	startPosition          int64
	// mergeProgressTimeoutMs is the longest gap, in milliseconds, allowed since
	// timeOfLastProgressMs before checkProgress reports an error.
	mergeProgressTimeoutMs int64
	// replaySessionId is the relevant id taken from the archive's replay response;
	// it is used to look up the image and to stop the replay.
	replaySessionId        int64
	// activeCorrelationId is the correlation id of the request awaiting a response,
	// or aeron.NullValue when none is outstanding. setState always resets it.
	activeCorrelationId    int64
	// nextTargetPosition is the most recent position reported by the archive that
	// the image position is compared against.
	nextTargetPosition     int64
	// positionOfLastProgress is the image position recorded when progress was last noted.
	positionOfLastProgress int64
	// timeOfLastProgressMs is the time.Now().UnixMilli() value recorded when progress was last noted.
	timeOfLastProgressMs   int64
	// isLiveAdded is set once the live destination has been added to the subscription.
	isLiveAdded            bool
	// isReplayActive is set when the replay response arrives and cleared when the
	// stop-replay request is accepted.
	isReplayActive         bool
	// state is the current step of the merge state machine.
	state                  State
	// image is the replay session's image; nil until catchup finds it.
	image                  aeron.Image

	// archive is the client used to issue requests and poll control responses.
	archive           *archive.Archive
	// subscription is the subscription to which destinations are added and removed.
	subscription      *aeron.Subscription
	// replayDestination is the destination added for the replay and removed once merged or closed.
	replayDestination string
	// liveDestination is the destination added when the replay is close enough to the target position.
	liveDestination   string
	// replayEndpoint is the endpoint parameter parsed from replayDestination.
	replayEndpoint    string
	// replayChannelUri is the parsed replay channel sent to the archive, with the
	// linger, EOS and endpoint parameters overridden.
	replayChannelUri  aeron.ChannelUri
}
    79  
    80  // NewReplayMerge creates a ReplayMerge to manage the merging of a replayed stream and switching over to live stream as
    81  // appropriate.
    82  //
    83  // Parameters:
    84  //
    85  // subscription           to use for the replay and live stream. Must be a multi-destination subscription.
    86  // archive                to use for the replay.
    87  // replayChannel          to as a template for what the archive will use.
    88  // replayDestination      to send the replay to and the destination added by the Subscription.
    89  // liveDestination        for the live stream and the destination added by the Subscription.
    90  // recordingId            for the replay.
    91  // startPosition          for the replay.
    92  // epochClock             to use for progress checks.
    93  // mergeProgressTimeoutMs to use for progress checks.
    94  func NewReplayMerge(
    95  	subscription *aeron.Subscription,
    96  	archive *archive.Archive,
    97  	replayChannel string,
    98  	replayDestination string,
    99  	liveDestination string,
   100  	recordingId int64,
   101  	startPosition int64,
   102  	mergeProgressTimeoutMs int64) (rm *ReplayMerge, err error) {
   103  	if strings.HasPrefix(subscription.Channel(), aeron.IpcChannel) ||
   104  		strings.HasPrefix(replayChannel, aeron.IpcChannel) ||
   105  		strings.HasPrefix(replayDestination, aeron.IpcChannel) ||
   106  		strings.HasPrefix(liveDestination, aeron.IpcChannel) {
   107  		err = fmt.Errorf("IPC merging is not supported")
   108  		return
   109  	}
   110  
   111  	if !strings.Contains(subscription.Channel(), "control-mode=manual") {
   112  		err = fmt.Errorf("Subscription URI must have 'control-mode=manual' uri=%s", subscription.Channel())
   113  		return
   114  	}
   115  
   116  	rm = &ReplayMerge{
   117  		archive:                archive,
   118  		subscription:           subscription,
   119  		replayDestination:      replayDestination,
   120  		liveDestination:        liveDestination,
   121  		recordingId:            recordingId,
   122  		startPosition:          startPosition,
   123  		mergeProgressTimeoutMs: mergeProgressTimeoutMs,
   124  		replaySessionId:        aeron.NullValue,
   125  		activeCorrelationId:    aeron.NullValue,
   126  		nextTargetPosition:     aeron.NullValue,
   127  		positionOfLastProgress: aeron.NullValue,
   128  	}
   129  
   130  	rm.replayChannelUri, err = aeron.ParseChannelUri(replayChannel)
   131  	if err != nil {
   132  		err = fmt.Errorf("Invalid replay channel '%s'", replayChannel)
   133  		return
   134  	}
   135  
   136  	rm.replayChannelUri.Set(aeron.LingerParamName, "0")
   137  	rm.replayChannelUri.Set(aeron.EosParamName, "false")
   138  
   139  	var replayDestinationUri aeron.ChannelUri
   140  	replayDestinationUri, err = aeron.ParseChannelUri(replayDestination)
   141  	if err != nil {
   142  		err = fmt.Errorf("Invalid replay destination '%s'", replayDestination)
   143  		return
   144  	}
   145  	rm.replayEndpoint = replayDestinationUri.Get(aeron.EndpointParamName)
   146  	if strings.HasSuffix(rm.replayEndpoint, ":0") {
   147  		rm.state = StateResolveReplayPort
   148  	} else {
   149  		rm.replayChannelUri.Set(aeron.EndpointParamName, rm.replayEndpoint)
   150  		rm.state = StateGetRecordingPosition
   151  	}
   152  
   153  	subscription.AddDestination(replayDestination)
   154  	rm.timeOfLastProgressMs = time.Now().UnixMilli()
   155  	return
   156  }
   157  
   158  // Close closes and stops any active replay. Will remove the replay destination from the subscription.
   159  // This operation Will NOT remove the live destination if it has been added, so it can be used for live consumption.
   160  func (rm *ReplayMerge) Close() {
   161  	state := rm.state
   162  	if StateClosed != state {
   163  		if !rm.archive.Aeron().IsClosed() {
   164  			if StateMerged != state {
   165  				rm.subscription.RemoveDestination(rm.replayDestination)
   166  			}
   167  
   168  			if rm.isReplayActive && rm.archive.Proxy.Publication.IsConnected() {
   169  				rm.stopReplay()
   170  			}
   171  		}
   172  
   173  		rm.setState(StateClosed)
   174  	}
   175  }
   176  
// Subscription returns the Subscription that was supplied to NewReplayMerge and is used to consume the
// replayed and merged stream.
func (rm *ReplayMerge) Subscription() *aeron.Subscription {
	return rm.subscription
}
   181  
   182  // DoWork performs the work of replaying and merging. Should only be used if polling the underlying Image directly.
   183  // Returns indication of work done processing the merge.
   184  func (rm *ReplayMerge) DoWork() (workCount int, err error) {
   185  	nowMs := time.Now().UnixMilli()
   186  
   187  	switch rm.state {
   188  	case StateResolveReplayPort:
   189  		workCount, err = rm.resolveReplayPort(nowMs)
   190  		if err != nil {
   191  			rm.setState(StateFailed)
   192  			return
   193  		}
   194  		if err = rm.checkProgress(nowMs); err != nil {
   195  			rm.setState(StateFailed)
   196  			return
   197  		}
   198  	case StateGetRecordingPosition:
   199  		workCount, err = rm.getRecordingPosition(nowMs)
   200  		if err != nil {
   201  			rm.setState(StateFailed)
   202  			return
   203  		}
   204  		if err = rm.checkProgress(nowMs); err != nil {
   205  			rm.setState(StateFailed)
   206  			return
   207  		}
   208  	case StateReplay:
   209  		workCount, err = rm.replay(nowMs)
   210  		if err != nil {
   211  			rm.setState(StateFailed)
   212  			return
   213  		}
   214  		if err = rm.checkProgress(nowMs); err != nil {
   215  			rm.setState(StateFailed)
   216  			return
   217  		}
   218  	case StateCatchup:
   219  		workCount, err = rm.catchup(nowMs)
   220  		if err != nil {
   221  			rm.setState(StateFailed)
   222  			return
   223  		}
   224  		if err = rm.checkProgress(nowMs); err != nil {
   225  			rm.setState(StateFailed)
   226  			return
   227  		}
   228  	case StateAttemptLiveJoin:
   229  		workCount, err = rm.attemptLiveJoin(nowMs)
   230  		if err != nil {
   231  			rm.setState(StateFailed)
   232  			return
   233  		}
   234  		if err = rm.checkProgress(nowMs); err != nil {
   235  			rm.setState(StateFailed)
   236  			return
   237  		}
   238  	}
   239  	return
   240  }
   241  
   242  // Poll polls the Image used for replay and merging and live stream. The doWork method
   243  // will be called before the poll so that processing of the merge can be done.
   244  //
   245  // Returns number of fragments processed.
   246  func (rm *ReplayMerge) Poll(fragmentHandler term.FragmentHandler, fragmentLimit int) (workCount int, err error) {
   247  	workCount, err = rm.DoWork()
   248  	if err != nil {
   249  		return
   250  	}
   251  	if rm.image == nil {
   252  		return
   253  	}
   254  	return rm.image.Poll(fragmentHandler, fragmentLimit), nil
   255  }
   256  
   257  // IsMerged returns if the live stream merged and the replay stopped?
   258  func (rm *ReplayMerge) IsMerged() bool {
   259  	return rm.state == StateMerged
   260  }
   261  
   262  // HasFailed returns if the replay merge failed due to an error?
   263  func (rm *ReplayMerge) HasFailed() bool {
   264  	return rm.state == StateFailed
   265  }
   266  
// Image returns the image which is a merge of the replay and live stream. It is nil until the replay
// session's image has been found during catchup.
func (rm *ReplayMerge) Image() aeron.Image {
	return rm.image
}
   271  
// IsLiveAdded reports whether the live destination has been added to the subscription.
func (rm *ReplayMerge) IsLiveAdded() bool {
	return rm.isLiveAdded
}
   276  
   277  func (rm *ReplayMerge) resolveReplayPort(nowMs int64) (workCount int, err error) {
   278  	resolvedEndpoint := rm.subscription.ResolvedEndpoint()
   279  	if resolvedEndpoint != "" {
   280  		i := strings.LastIndex(resolvedEndpoint, ":")
   281  		rm.replayChannelUri.Set(aeron.EndpointParamName,
   282  			rm.replayEndpoint[0:len(rm.replayEndpoint)-2]+resolvedEndpoint[i:])
   283  
   284  		rm.timeOfLastProgressMs = nowMs
   285  		rm.setState(StateGetRecordingPosition)
   286  		workCount += 1
   287  	}
   288  
   289  	return
   290  }
   291  
   292  func (rm *ReplayMerge) getRecordingPosition(nowMs int64) (workCount int, err error) {
   293  	if aeron.NullValue == rm.activeCorrelationId {
   294  		correlationId := rm.archive.Aeron().NextCorrelationID()
   295  
   296  		if rm.archive.Proxy.RecordingPositionRequest(correlationId, rm.recordingId) == nil {
   297  			rm.activeCorrelationId = correlationId
   298  			rm.timeOfLastProgressMs = nowMs
   299  			workCount += 1
   300  		}
   301  		return
   302  	}
   303  
   304  	var success bool
   305  	success, err = rm.pollForResponse()
   306  	if err != nil {
   307  		return
   308  	}
   309  	if success {
   310  		rm.nextTargetPosition = rm.polledRelevantId()
   311  		rm.activeCorrelationId = aeron.NullValue
   312  
   313  		if archive.RecordingPositionNull == rm.nextTargetPosition {
   314  			correlationId := rm.archive.Aeron().NextCorrelationID()
   315  
   316  			if rm.archive.Proxy.StopPositionRequest(correlationId, rm.recordingId) == nil {
   317  				rm.activeCorrelationId = correlationId
   318  				rm.timeOfLastProgressMs = nowMs
   319  				workCount += 1
   320  			}
   321  		} else {
   322  			rm.timeOfLastProgressMs = nowMs
   323  			rm.setState(StateReplay)
   324  		}
   325  	}
   326  
   327  	workCount += 1
   328  
   329  	return
   330  }
   331  
   332  func (rm *ReplayMerge) replay(nowMs int64) (workCount int, err error) {
   333  	if aeron.NullValue == rm.activeCorrelationId {
   334  		correlationId := rm.archive.Aeron().NextCorrelationID()
   335  		if rm.archive.Proxy.ReplayRequest(
   336  			correlationId,
   337  			rm.recordingId,
   338  			rm.startPosition,
   339  			archive.RecordingLengthMax,
   340  			rm.replayChannelUri.String(),
   341  			rm.subscription.StreamID()) == nil {
   342  			rm.activeCorrelationId = correlationId
   343  			rm.timeOfLastProgressMs = nowMs
   344  			workCount += 1
   345  		}
   346  		return
   347  	}
   348  
   349  	var success bool
   350  	success, err = rm.pollForResponse()
   351  	if err != nil {
   352  		return
   353  	}
   354  	if success {
   355  		rm.isReplayActive = true
   356  		rm.replaySessionId = rm.polledRelevantId()
   357  		rm.timeOfLastProgressMs = nowMs
   358  		rm.setState(StateCatchup)
   359  		workCount += 1
   360  	}
   361  	return
   362  }
   363  
   364  func (rm *ReplayMerge) catchup(nowMs int64) (workCount int, err error) {
   365  
   366  	if rm.image == nil && rm.subscription.IsConnected() {
   367  		rm.timeOfLastProgressMs = nowMs
   368  		rm.image = rm.subscription.ImageBySessionID(int32(rm.replaySessionId))
   369  		rm.positionOfLastProgress = aeron.NullValue
   370  		if rm.image != nil {
   371  			rm.positionOfLastProgress = rm.image.Position()
   372  		}
   373  	}
   374  
   375  	if rm.image != nil {
   376  		position := rm.image.Position()
   377  		if position >= rm.nextTargetPosition {
   378  			rm.timeOfLastProgressMs = nowMs
   379  			rm.positionOfLastProgress = position
   380  			rm.setState(StateAttemptLiveJoin)
   381  			workCount += 1
   382  		} else if position > rm.positionOfLastProgress {
   383  			rm.timeOfLastProgressMs = nowMs
   384  			rm.positionOfLastProgress = position
   385  		} else if rm.image.IsClosed() {
   386  			err = fmt.Errorf("ReplayMerge Image closed unexpectedly.")
   387  			return
   388  		}
   389  	}
   390  	return
   391  }
   392  
   393  func (rm *ReplayMerge) attemptLiveJoin(nowMs int64) (workCount int, err error) {
   394  
   395  	if aeron.NullValue == rm.activeCorrelationId {
   396  		correlationId := rm.archive.Aeron().NextCorrelationID()
   397  		if rm.archive.Proxy.RecordingPositionRequest(correlationId, rm.recordingId) == nil {
   398  			rm.activeCorrelationId = correlationId
   399  			workCount += 1
   400  		}
   401  		return
   402  	}
   403  
   404  	var success bool
   405  	success, err = rm.pollForResponse()
   406  	if err != nil {
   407  		return
   408  	}
   409  	if success {
   410  		rm.nextTargetPosition = rm.polledRelevantId()
   411  		rm.activeCorrelationId = aeron.NullValue
   412  
   413  		if archive.RecordingPositionNull == rm.nextTargetPosition {
   414  			correlationId := rm.archive.Aeron().NextCorrelationID()
   415  			if rm.archive.Proxy.RecordingPositionRequest(correlationId, rm.recordingId) == nil {
   416  				rm.activeCorrelationId = correlationId
   417  			}
   418  		} else {
   419  			nextState := StateCatchup
   420  
   421  			if rm.image != nil {
   422  				position := rm.image.Position()
   423  
   424  				if rm.shouldAddLiveDestination(position) {
   425  					rm.subscription.AddDestination(rm.liveDestination)
   426  					rm.timeOfLastProgressMs = nowMs
   427  					rm.positionOfLastProgress = position
   428  					rm.isLiveAdded = true
   429  				} else if rm.shouldStopAndRemoveReplay(position) {
   430  					rm.subscription.RemoveDestination(rm.replayDestination)
   431  					rm.stopReplay()
   432  					rm.timeOfLastProgressMs = nowMs
   433  					rm.positionOfLastProgress = position
   434  					nextState = StateMerged
   435  				}
   436  			}
   437  
   438  			rm.setState(nextState)
   439  		}
   440  
   441  		workCount += 1
   442  	}
   443  
   444  	return
   445  }
   446  
   447  func (rm *ReplayMerge) stopReplay() {
   448  	correlationId := rm.archive.Aeron().NextCorrelationID()
   449  	if rm.archive.Proxy.StopReplayRequest(correlationId, rm.replaySessionId) == nil {
   450  		rm.isReplayActive = false
   451  	}
   452  }
   453  
   454  func (rm *ReplayMerge) setState(newState State) {
   455  	rm.state = newState
   456  	rm.activeCorrelationId = aeron.NullValue
   457  }
   458  
   459  func (rm *ReplayMerge) shouldAddLiveDestination(position int64) bool {
   460  	mn := rm.image.TermBufferLength() >> 2
   461  	if mn > LiveAddMaxWindow {
   462  		mn = LiveAddMaxWindow
   463  	}
   464  	return !rm.isLiveAdded &&
   465  		(rm.nextTargetPosition-position) <= int64(mn)
   466  }
   467  
   468  func (rm *ReplayMerge) shouldStopAndRemoveReplay(position int64) bool {
   469  	return rm.isLiveAdded &&
   470  		(rm.nextTargetPosition-position) <= ReplayRemoveThreshold &&
   471  		rm.image.ActiveTransportCount() >= 2
   472  }
   473  
   474  func (rm *ReplayMerge) checkProgress(nowMs int64) error {
   475  	if nowMs > (rm.timeOfLastProgressMs + rm.mergeProgressTimeoutMs) {
   476  		return fmt.Errorf("ReplayMerge no progress: state=%s", rm.state)
   477  	}
   478  	return nil
   479  }
   480  
   481  // Returns whether this succeeded, and what the error is.
   482  func (rm *ReplayMerge) pollForResponse() (bool, error) {
   483  	correlationId := rm.activeCorrelationId
   484  	poller := rm.archive.Control
   485  
   486  	if poller.Poll() > 0 && poller.Results.IsPollComplete {
   487  		if poller.Results.ControlResponse.ControlSessionId == rm.archive.SessionID {
   488  			if poller.Results.ErrorResponse != nil {
   489  				err := fmt.Errorf(
   490  					"archive response for correlationId=%d, error=%s",
   491  					correlationId,
   492  					poller.Results.ErrorResponse,
   493  				)
   494  				return false, err
   495  			}
   496  		}
   497  		return poller.Results.CorrelationId == correlationId, nil
   498  	}
   499  	// TODO: (false, nil) is what was here before, but I suspect that (true, nil) is more accurate?
   500  	// Check this when revamping archive code.
   501  	return false, nil
   502  }
   503  
   504  func (rm *ReplayMerge) polledRelevantId() int64 {
   505  	poller := rm.archive.Control
   506  	return poller.Results.ControlResponse.RelevantId
   507  }