github.com/filecoin-project/bacalhau@v0.3.23-0.20230228154132-45c989550ace/pkg/jobstore/inmemory/inmemory.go (about)

     1  package inmemory
     2  
     3  import (
     4  	"context"
     5  	"sort"
     6  	"time"
     7  
     8  	sync "github.com/bacalhau-project/golang-mutex-tracer"
     9  	"github.com/imdario/mergo"
    10  	"golang.org/x/exp/maps"
    11  	"golang.org/x/exp/slices"
    12  
    13  	"github.com/filecoin-project/bacalhau/pkg/bacerrors"
    14  	jobutils "github.com/filecoin-project/bacalhau/pkg/job"
    15  	"github.com/filecoin-project/bacalhau/pkg/jobstore"
    16  	"github.com/filecoin-project/bacalhau/pkg/model"
    17  )
    18  
// newJobComment is the comment attached to the history entry that CreateJob
// records for a newly created job.
const newJobComment = "Job created"
    20  
// JobStore keeps jobs, their states and their history in memory. All maps are
// keyed by job ID and every exported method takes mtx before touching them.
type JobStore struct {
	// The maps hold values, not pointers: an update reads an entry, modifies
	// the copy and writes it back. inprogress is the set of job IDs added by
	// CreateJob and removed once a job moves to a terminal state.
	jobs       map[string]model.Job
	states     map[string]model.JobState
	history    map[string][]model.JobHistory
	inprogress map[string]struct{}
	mtx        sync.RWMutex
}
    29  
    30  func NewJobStore() *JobStore {
    31  	res := &JobStore{
    32  		jobs:       make(map[string]model.Job),
    33  		states:     make(map[string]model.JobState),
    34  		history:    make(map[string][]model.JobHistory),
    35  		inprogress: make(map[string]struct{}),
    36  	}
    37  	res.mtx.EnableTracerWithOpts(sync.Opts{
    38  		Threshold: 10 * time.Millisecond,
    39  		Id:        "InMemoryJobStore.mtx",
    40  	})
    41  	return res
    42  }
    43  
    44  // Gets a job from the datastore.
    45  //
    46  // Errors:
    47  //
    48  //   - error-job-not-found        		  -- if the job is not found
    49  func (d *JobStore) GetJob(_ context.Context, id string) (model.Job, error) {
    50  	d.mtx.RLock()
    51  	defer d.mtx.RUnlock()
    52  	return d.getJob(id)
    53  }
    54  
    55  func (d *JobStore) GetJobs(ctx context.Context, query jobstore.JobQuery) ([]model.Job, error) {
    56  	d.mtx.RLock()
    57  	defer d.mtx.RUnlock()
    58  	var result []model.Job
    59  
    60  	if query.ID != "" {
    61  		j, err := d.getJob(query.ID)
    62  		if err != nil {
    63  			return nil, err
    64  		}
    65  		return []model.Job{j}, nil
    66  	}
    67  
    68  	for _, j := range maps.Values(d.jobs) {
    69  		if query.Limit > 0 && len(result) == query.Limit {
    70  			break
    71  		}
    72  
    73  		if !query.ReturnAll && query.ClientID != "" && query.ClientID != j.Metadata.ClientID {
    74  			// Job is not for the requesting client, so ignore it.
    75  			continue
    76  		}
    77  
    78  		// If we are not using include tags, by default every job is included.
    79  		// If a job is specifically included, that overrides it being excluded.
    80  		included := len(query.IncludeTags) == 0
    81  		for _, tag := range j.Spec.Annotations {
    82  			if slices.Contains(query.IncludeTags, model.IncludedTag(tag)) {
    83  				included = true
    84  				break
    85  			}
    86  			if slices.Contains(query.ExcludeTags, model.ExcludedTag(tag)) {
    87  				included = false
    88  				break
    89  			}
    90  		}
    91  
    92  		if !included {
    93  			continue
    94  		}
    95  
    96  		result = append(result, j)
    97  	}
    98  
    99  	listSorter := func(i, j int) bool {
   100  		switch query.SortBy {
   101  		case "id":
   102  			if query.SortReverse {
   103  				// what does it mean to sort by ID?
   104  				return result[i].Metadata.ID > result[j].Metadata.ID
   105  			} else {
   106  				return result[i].Metadata.ID < result[j].Metadata.ID
   107  			}
   108  		case "created_at":
   109  			if query.SortReverse {
   110  				return result[i].Metadata.CreatedAt.UTC().Unix() > result[j].Metadata.CreatedAt.UTC().Unix()
   111  			} else {
   112  				return result[i].Metadata.CreatedAt.UTC().Unix() < result[j].Metadata.CreatedAt.UTC().Unix()
   113  			}
   114  		default:
   115  			return false
   116  		}
   117  	}
   118  	sort.Slice(result, listSorter)
   119  	return result, nil
   120  }
   121  
   122  func (d *JobStore) GetJobState(_ context.Context, jobID string) (model.JobState, error) {
   123  	d.mtx.RLock()
   124  	defer d.mtx.RUnlock()
   125  	state, ok := d.states[jobID]
   126  	if !ok {
   127  		return model.JobState{}, bacerrors.NewJobNotFound(jobID)
   128  	}
   129  	return state, nil
   130  }
   131  
   132  func (d *JobStore) GetInProgressJobs(ctx context.Context) ([]model.JobWithInfo, error) {
   133  	d.mtx.RLock()
   134  	defer d.mtx.RUnlock()
   135  	var result []model.JobWithInfo
   136  	for id := range d.inprogress {
   137  		result = append(result, model.JobWithInfo{
   138  			Job:   d.jobs[id],
   139  			State: d.states[id],
   140  		})
   141  	}
   142  	return result, nil
   143  }
   144  
   145  func (d *JobStore) GetJobHistory(_ context.Context, jobID string) ([]model.JobHistory, error) {
   146  	d.mtx.RLock()
   147  	defer d.mtx.RUnlock()
   148  	history, ok := d.history[jobID]
   149  	if !ok {
   150  		return nil, jobstore.NewErrJobNotFound(jobID)
   151  	}
   152  	return history, nil
   153  }
   154  
   155  func (d *JobStore) GetJobsCount(ctx context.Context, query jobstore.JobQuery) (int, error) {
   156  	useQuery := query
   157  	useQuery.Limit = 0
   158  	useQuery.Offset = 0
   159  	jobs, err := d.GetJobs(ctx, useQuery)
   160  	if err != nil {
   161  		return 0, err
   162  	}
   163  	return len(jobs), nil
   164  }
   165  
   166  func (d *JobStore) CreateJob(_ context.Context, job model.Job) error {
   167  	d.mtx.Lock()
   168  	defer d.mtx.Unlock()
   169  	existingJob, ok := d.jobs[job.Metadata.ID]
   170  	if ok {
   171  		return jobstore.NewErrJobAlreadyExists(existingJob.Metadata.ID)
   172  	}
   173  	d.jobs[job.Metadata.ID] = job
   174  
   175  	// populate shard states
   176  	shardStates := make(map[int]model.ShardState, job.Spec.ExecutionPlan.TotalShards)
   177  	for i := 0; i < job.Spec.ExecutionPlan.TotalShards; i++ {
   178  		shardStates[i] = model.ShardState{
   179  			JobID:      job.Metadata.ID,
   180  			ShardIndex: i,
   181  			State:      model.ShardStateInProgress,
   182  			Version:    1,
   183  			CreateTime: time.Now(),
   184  			UpdateTime: time.Now(),
   185  		}
   186  	}
   187  
   188  	// populate job state
   189  	jobState := model.JobState{
   190  		JobID:      job.Metadata.ID,
   191  		Shards:     shardStates,
   192  		State:      model.JobStateInProgress,
   193  		Version:    1,
   194  		CreateTime: time.Now(),
   195  		UpdateTime: time.Now(),
   196  	}
   197  	d.states[job.Metadata.ID] = jobState
   198  	d.inprogress[job.Metadata.ID] = struct{}{}
   199  	d.appendJobHistory(jobState, model.JobStateNew, newJobComment)
   200  	return nil
   201  }
   202  
   203  // helper method to read a single job from memory. This is used by both GetJob and GetJobs.
   204  // It is important that we don't attempt to acquire a lock inside this method to avoid deadlocks since
   205  // the callers are expected to be holding a lock, and golang doesn't support reentrant locks.
   206  func (d *JobStore) getJob(id string) (model.Job, error) {
   207  	if len(id) < model.ShortIDLength {
   208  		return model.Job{}, bacerrors.NewJobNotFound(id)
   209  	}
   210  
   211  	// support for short job IDs
   212  	if jobutils.ShortID(id) == id {
   213  		// passed in a short id, need to resolve the long id first
   214  		for k := range d.jobs {
   215  			if jobutils.ShortID(k) == id {
   216  				id = k
   217  				break
   218  			}
   219  		}
   220  	}
   221  
   222  	j, ok := d.jobs[id]
   223  	if !ok {
   224  		returnError := bacerrors.NewJobNotFound(id)
   225  		return model.Job{}, returnError
   226  	}
   227  
   228  	return j, nil
   229  }
   230  
   231  func (d *JobStore) UpdateJobState(_ context.Context, request jobstore.UpdateJobStateRequest) error {
   232  	d.mtx.Lock()
   233  	defer d.mtx.Unlock()
   234  
   235  	// get the existing job state
   236  	jobState, ok := d.states[request.JobID]
   237  	if !ok {
   238  		return jobstore.NewErrJobNotFound(request.JobID)
   239  	}
   240  
   241  	// check the expected state
   242  	if err := request.Condition.Validate(jobState); err != nil {
   243  		return err
   244  	}
   245  	if jobState.State.IsTerminal() {
   246  		return jobstore.NewErrJobAlreadyTerminal(request.JobID, jobState.State, request.NewState)
   247  	}
   248  
   249  	// update the job state
   250  	previousState := jobState.State
   251  	jobState.State = request.NewState
   252  	jobState.Version++
   253  	jobState.UpdateTime = time.Now()
   254  	d.states[request.JobID] = jobState
   255  	if request.NewState.IsTerminal() {
   256  		delete(d.inprogress, request.JobID)
   257  	}
   258  	d.appendJobHistory(jobState, previousState, request.Comment)
   259  	return nil
   260  }
   261  
   262  func (d *JobStore) GetShardState(_ context.Context, shardID model.ShardID) (model.ShardState, error) {
   263  	d.mtx.RLock()
   264  	defer d.mtx.RUnlock()
   265  	jobState, ok := d.states[shardID.JobID]
   266  	if !ok {
   267  		return model.ShardState{}, jobstore.NewErrJobNotFound(shardID.JobID)
   268  	}
   269  	shardState, ok := jobState.Shards[shardID.Index]
   270  	if !ok {
   271  		return model.ShardState{}, jobstore.NewErrShardNotFound(shardID)
   272  	}
   273  	return shardState, nil
   274  }
   275  
   276  func (d *JobStore) UpdateShardState(_ context.Context, request jobstore.UpdateShardStateRequest) error {
   277  	d.mtx.Lock()
   278  	defer d.mtx.Unlock()
   279  
   280  	// find the existing shard
   281  	jobState, ok := d.states[request.ShardID.JobID]
   282  	if !ok {
   283  		return jobstore.NewErrJobNotFound(request.ShardID.JobID)
   284  	}
   285  	shardState, ok := jobState.Shards[request.ShardID.Index]
   286  	if !ok {
   287  		return jobstore.NewErrShardNotFound(request.ShardID)
   288  	}
   289  
   290  	// check the expected state
   291  	if err := request.Condition.Validate(shardState); err != nil {
   292  		return err
   293  	}
   294  	if shardState.State.IsTerminal() {
   295  		return jobstore.NewErrShardAlreadyTerminal(request.ShardID, shardState.State, request.NewState)
   296  	}
   297  
   298  	// update the shard state
   299  	previousState := shardState.State
   300  	shardState.State = request.NewState
   301  	shardState.Version++
   302  	shardState.UpdateTime = time.Now()
   303  	jobState.Shards[request.ShardID.Index] = shardState
   304  	d.states[request.ShardID.JobID] = jobState
   305  	d.appendShardHistory(shardState, previousState, request.Comment)
   306  	return nil
   307  }
   308  
   309  func (d *JobStore) CreateExecution(_ context.Context, execution model.ExecutionState) error {
   310  	d.mtx.Lock()
   311  	defer d.mtx.Unlock()
   312  	jobState, ok := d.states[execution.JobID]
   313  	if !ok {
   314  		return jobstore.NewErrJobNotFound(execution.JobID)
   315  	}
   316  	shardState, ok := jobState.Shards[execution.ShardIndex]
   317  	if !ok {
   318  		return jobstore.NewErrShardNotFound(execution.ShardID())
   319  	}
   320  	for _, e := range shardState.Executions {
   321  		if e.ID() == execution.ID() {
   322  			return jobstore.NewErrExecutionAlreadyExists(execution.ID())
   323  		}
   324  	}
   325  	if execution.CreateTime.IsZero() {
   326  		execution.CreateTime = time.Now()
   327  	}
   328  	if execution.UpdateTime.IsZero() {
   329  		execution.UpdateTime = execution.CreateTime
   330  	}
   331  	if execution.Version == 0 {
   332  		execution.Version = 1
   333  	}
   334  	shardState.Executions = append(shardState.Executions, execution)
   335  	jobState.Shards[execution.ShardIndex] = shardState
   336  	d.states[execution.JobID] = jobState
   337  	d.appendExecutionHistory(execution, model.ExecutionStateNew, "")
   338  	return nil
   339  }
   340  
   341  func (d *JobStore) UpdateExecution(_ context.Context, request jobstore.UpdateExecutionRequest) error {
   342  	d.mtx.Lock()
   343  	defer d.mtx.Unlock()
   344  
   345  	// find the existing execution
   346  	jobState, ok := d.states[request.ExecutionID.JobID]
   347  	if !ok {
   348  		return jobstore.NewErrJobNotFound(request.ExecutionID.JobID)
   349  	}
   350  	shardState, ok := jobState.Shards[request.ExecutionID.ShardIndex]
   351  	if !ok {
   352  		return jobstore.NewErrShardNotFound(request.ExecutionID.ShardID())
   353  	}
   354  	var existingExecution model.ExecutionState
   355  	executionIndex := -1
   356  	for i, e := range shardState.Executions {
   357  		if e.ID() == request.ExecutionID {
   358  			existingExecution = e
   359  			executionIndex = i
   360  			break
   361  		}
   362  	}
   363  	if executionIndex == -1 {
   364  		return jobstore.NewErrExecutionNotFound(request.ExecutionID)
   365  	}
   366  
   367  	// check the expected state
   368  	if err := request.Condition.Validate(existingExecution); err != nil {
   369  		return err
   370  	}
   371  	if existingExecution.State.IsTerminal() {
   372  		return jobstore.NewErrExecutionAlreadyTerminal(request.ExecutionID, existingExecution.State, request.NewValues.State)
   373  	}
   374  
   375  	// populate default values
   376  	newExecution := request.NewValues
   377  	if newExecution.CreateTime.IsZero() {
   378  		newExecution.CreateTime = time.Now()
   379  	}
   380  	if newExecution.UpdateTime.IsZero() {
   381  		newExecution.UpdateTime = existingExecution.CreateTime
   382  	}
   383  	if newExecution.Version == 0 {
   384  		newExecution.Version = existingExecution.Version + 1
   385  	}
   386  
   387  	err := mergo.Merge(&newExecution, existingExecution)
   388  	if err != nil {
   389  		return err
   390  	}
   391  
   392  	// update the execution
   393  	previousState := existingExecution.State
   394  	shardState.Executions[executionIndex] = newExecution
   395  	jobState.Shards[newExecution.ShardIndex] = shardState
   396  	d.states[newExecution.JobID] = jobState
   397  	d.appendExecutionHistory(newExecution, previousState, request.Comment)
   398  	return nil
   399  }
   400  
   401  func (d *JobStore) appendJobHistory(updateJob model.JobState, previousState model.JobStateType, comment string) {
   402  	historyEntry := model.JobHistory{
   403  		Type:          model.JobHistoryTypeJobLevel,
   404  		JobID:         updateJob.JobID,
   405  		PreviousState: previousState.String(),
   406  		NewState:      updateJob.State.String(),
   407  		NewVersion:    updateJob.Version,
   408  		Comment:       comment,
   409  		Time:          updateJob.UpdateTime,
   410  	}
   411  	d.history[updateJob.JobID] = append(d.history[updateJob.JobID], historyEntry)
   412  }
   413  
   414  func (d *JobStore) appendShardHistory(updatedShard model.ShardState, previousState model.ShardStateType, comment string) {
   415  	historyEntry := model.JobHistory{
   416  		Type:          model.JobHistoryTypeShardLevel,
   417  		JobID:         updatedShard.JobID,
   418  		ShardIndex:    updatedShard.ShardIndex,
   419  		PreviousState: previousState.String(),
   420  		NewState:      updatedShard.State.String(),
   421  		NewVersion:    updatedShard.Version,
   422  		Comment:       comment,
   423  		Time:          updatedShard.UpdateTime,
   424  	}
   425  	d.history[updatedShard.JobID] = append(d.history[updatedShard.JobID], historyEntry)
   426  }
   427  
   428  func (d *JobStore) appendExecutionHistory(updatedExecution model.ExecutionState, previousState model.ExecutionStateType, comment string) {
   429  	historyEntry := model.JobHistory{
   430  		Type:             model.JobHistoryTypeExecutionLevel,
   431  		JobID:            updatedExecution.JobID,
   432  		ShardIndex:       updatedExecution.ShardIndex,
   433  		NodeID:           updatedExecution.NodeID,
   434  		ComputeReference: updatedExecution.ComputeReference,
   435  		PreviousState:    previousState.String(),
   436  		NewState:         updatedExecution.State.String(),
   437  		NewStateType:     updatedExecution.State,
   438  		NewVersion:       updatedExecution.Version,
   439  		Comment:          comment,
   440  		Time:             updatedExecution.UpdateTime,
   441  	}
   442  	d.history[updatedExecution.JobID] = append(d.history[updatedExecution.JobID], historyEntry)
   443  }
   444  
// Static check to ensure that *JobStore implements jobstore.Store:
var _ jobstore.Store = (*JobStore)(nil)