github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/engine/jobmaster/dm/metadata/job.go (about)

     1  // Copyright 2022 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package metadata
    15  
    16  import (
    17  	"context"
    18  	"encoding/json"
    19  	"fmt"
    20  	"sync"
    21  	"time"
    22  
    23  	"github.com/pingcap/tiflow/engine/jobmaster/dm/bootstrap"
    24  	"github.com/pingcap/tiflow/engine/jobmaster/dm/config"
    25  	"github.com/pingcap/tiflow/engine/pkg/adapter"
    26  	metaModel "github.com/pingcap/tiflow/engine/pkg/meta/model"
    27  	"github.com/pingcap/tiflow/pkg/errors"
    28  	"go.uber.org/zap"
    29  )
    30  
    31  // TaskStage represents internal stage of a task.
    32  // TODO: use Stage in lib or move Stage to lib.
    33  // we need to use same value for stage with same name in dmpb.Stage in order to make grafana dashboard label correct,
    34  // since we use the same grafana dashboard for OP and engine.
    35  // there's no need for them to have same meaning, just for grafana display.
    36  type TaskStage int
    37  
    38  // These stages may be updated in later pr.
    39  const (
    40  	StageInit     TaskStage = iota + 1  // = 1 = dmpb.Stage_New
    41  	StageRunning                        // = 2 = dmpb.Stage_Running
    42  	StagePaused                         // = 3 ~= dmpb.Stage_Paused. in engine this stage means paused by user, if it's auto-paused by error, it's StageError
    43  	StageFinished TaskStage = iota + 2  // = 5 = dmpb.Stage_Finished. skip 4 - Stopped, no such stage in engine, see dm/worker/metrics.go
    44  	StagePausing                        // = 6 = dmpb.Stage_Pausing
    45  	StageError    TaskStage = iota + 10 // = 15, leave some value space for extension of dmpb.Stage
    46  	// StageUnscheduled means the task is not scheduled.
    47  	// This usually happens when the worker is offline.
    48  	StageUnscheduled
    49  )
    50  
    51  var typesStringify = [...]string{
    52  	0:                "",
    53  	StageInit:        "Initing",
    54  	StageRunning:     "Running",
    55  	StagePaused:      "Paused",
    56  	StageFinished:    "Finished",
    57  	StageError:       "Error",
    58  	StagePausing:     "Pausing",
    59  	StageUnscheduled: "Unscheduled",
    60  }
    61  
    62  var toTaskStage map[string]TaskStage
    63  
    64  func init() {
    65  	toTaskStage = make(map[string]TaskStage, len(typesStringify))
    66  	toTaskStage[""] = TaskStage(0)
    67  	for i, s := range typesStringify {
    68  		if len(s) == 0 {
    69  			continue
    70  		}
    71  		toTaskStage[s] = TaskStage(i)
    72  	}
    73  }
    74  
    75  // String implements fmt.Stringer interface
    76  func (ts TaskStage) String() string {
    77  	if int(ts) >= len(typesStringify) || ts < 0 {
    78  		return fmt.Sprintf("Unknown TaskStage %d", ts)
    79  	}
    80  	return typesStringify[ts]
    81  }
    82  
    83  // MarshalJSON marshals the enum as a quoted json string
    84  func (ts TaskStage) MarshalJSON() ([]byte, error) {
    85  	return json.Marshal(ts.String())
    86  }
    87  
    88  // UnmarshalJSON unmashals a quoted json string to the enum value
    89  func (ts *TaskStage) UnmarshalJSON(b []byte) error {
    90  	var (
    91  		j  string
    92  		ok bool
    93  	)
    94  	if err := json.Unmarshal(b, &j); err != nil {
    95  		return err
    96  	}
    97  	*ts, ok = toTaskStage[j]
    98  	if !ok {
    99  		return errors.Errorf("Unknown TaskStage %s", j)
   100  	}
   101  	return nil
   102  }
   103  
   104  // Job represents the state of a job.
   105  type Job struct {
   106  	// taskID -> task
   107  	Tasks map[string]*Task
   108  
   109  	// Deleting represents whether the job is being deleted.
   110  	Deleting bool
   111  }
   112  
   113  // NewJob creates a new Job instance
   114  func NewJob(jobCfg *config.JobCfg) *Job {
   115  	taskCfgs := jobCfg.ToTaskCfgs()
   116  	job := &Job{
   117  		Tasks: make(map[string]*Task, len(taskCfgs)),
   118  	}
   119  
   120  	for taskID, taskCfg := range taskCfgs {
   121  		job.Tasks[taskID] = NewTask(taskCfg)
   122  	}
   123  	return job
   124  }
   125  
   126  // Task is the minimum working unit of a job.
   127  // A job may contain multiple upstream and it will be converted into multiple tasks.
   128  type Task struct {
   129  	Cfg              *config.TaskCfg
   130  	Stage            TaskStage
   131  	StageUpdatedTime time.Time
   132  }
   133  
   134  // NewTask creates a new Task instance
   135  func NewTask(taskCfg *config.TaskCfg) *Task {
   136  	return &Task{
   137  		Cfg:              taskCfg,
   138  		Stage:            StageRunning, // TODO: support set stage when create task.
   139  		StageUpdatedTime: time.Now(),
   140  	}
   141  }
   142  
   143  // JobStore manages the state of a job.
   144  type JobStore struct {
   145  	*frameworkMetaStore
   146  	*bootstrap.DefaultUpgrader
   147  
   148  	mu     sync.Mutex
   149  	logger *zap.Logger
   150  }
   151  
   152  // NewJobStore creates a new JobStore instance
   153  func NewJobStore(kvClient metaModel.KVClient, pLogger *zap.Logger) *JobStore {
   154  	logger := pLogger.With(zap.String("component", "job_store"))
   155  	jobStore := &JobStore{
   156  		frameworkMetaStore: newTOMLFrameworkMetaStore(kvClient),
   157  		DefaultUpgrader:    bootstrap.NewDefaultUpgrader(logger),
   158  		logger:             logger,
   159  	}
   160  	jobStore.frameworkMetaStore.stateFactory = jobStore
   161  	jobStore.DefaultUpgrader.Upgrader = jobStore
   162  	return jobStore
   163  }
   164  
   165  // CreateState returns an empty Job object
   166  func (jobStore *JobStore) createState() state {
   167  	return &Job{}
   168  }
   169  
   170  // Key returns encoded key for job store
   171  func (jobStore *JobStore) key() string {
   172  	return adapter.DMJobKeyAdapter.Encode()
   173  }
   174  
   175  // UpdateStages will be called if user operate job.
   176  func (jobStore *JobStore) UpdateStages(ctx context.Context, taskIDs []string, stage TaskStage) error {
   177  	jobStore.mu.Lock()
   178  	defer jobStore.mu.Unlock()
   179  	state, err := jobStore.Get(ctx)
   180  	if err != nil {
   181  		return errors.Trace(err)
   182  	}
   183  
   184  	job := state.(*Job)
   185  	if job.Deleting {
   186  		return errors.New("failed to update stages because job is being deleted")
   187  	}
   188  	if len(taskIDs) == 0 {
   189  		for task := range job.Tasks {
   190  			taskIDs = append(taskIDs, task)
   191  		}
   192  	}
   193  	for _, taskID := range taskIDs {
   194  		t, ok := job.Tasks[taskID]
   195  		if !ok {
   196  			return errors.Errorf("task %s not found", taskID)
   197  		}
   198  		t.Stage = stage
   199  		t.StageUpdatedTime = time.Now()
   200  	}
   201  
   202  	return jobStore.Put(ctx, job)
   203  }
   204  
   205  // UpdateConfig will be called if user update job config.
   206  func (jobStore *JobStore) UpdateConfig(ctx context.Context, jobCfg *config.JobCfg) error {
   207  	jobStore.mu.Lock()
   208  	defer jobStore.mu.Unlock()
   209  	state, err := jobStore.Get(ctx)
   210  	if err != nil {
   211  		return errors.Trace(err)
   212  	}
   213  	oldJob := state.(*Job)
   214  	if oldJob.Deleting {
   215  		return errors.New("failed to update config because job is being deleted")
   216  	}
   217  
   218  	// TODO: we may diff the config at task level in the future, that way different tasks will have different modify revisions.
   219  	// so that changing the configuration of one task will not affect other tasks.
   220  	var oldVersion uint64
   221  	for _, task := range oldJob.Tasks {
   222  		oldVersion = task.Cfg.ModRevision
   223  		break
   224  	}
   225  	jobCfg.ModRevision = oldVersion + 1
   226  	newJob := NewJob(jobCfg)
   227  
   228  	for taskID, newTask := range newJob.Tasks {
   229  		// task stage will not be updated.
   230  		if oldTask, ok := oldJob.Tasks[taskID]; ok {
   231  			newTask.Stage = oldTask.Stage
   232  			newTask.StageUpdatedTime = oldTask.StageUpdatedTime
   233  		}
   234  	}
   235  
   236  	return jobStore.Put(ctx, newJob)
   237  }
   238  
   239  // MarkDeleting marks the job as deleting.
   240  func (jobStore *JobStore) MarkDeleting(ctx context.Context) error {
   241  	jobStore.mu.Lock()
   242  	defer jobStore.mu.Unlock()
   243  	state, err := jobStore.Get(ctx)
   244  	if err != nil {
   245  		return errors.Trace(err)
   246  	}
   247  	job := state.(*Job)
   248  	job.Deleting = true
   249  	return jobStore.Put(ctx, job)
   250  }
   251  
   252  // UpgradeFuncs implement the Upgrader interface.
   253  func (jobStore *JobStore) UpgradeFuncs() []bootstrap.UpgradeFunc {
   254  	return nil
   255  }
   256  
   257  // GetJobCfg gets the job config.
   258  func (jobStore *JobStore) GetJobCfg(ctx context.Context) (*config.JobCfg, error) {
   259  	state, err := jobStore.Get(ctx)
   260  	if err != nil {
   261  		return nil, err
   262  	}
   263  	job := state.(*Job)
   264  	taskCfg := make([]*config.TaskCfg, 0, len(job.Tasks))
   265  	for _, task := range job.Tasks {
   266  		taskCfg = append(taskCfg, task.Cfg)
   267  	}
   268  	return config.FromTaskCfgs(taskCfg), nil
   269  }