go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/server/dsmapper/job.go (about)

     1  // Copyright 2018 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package dsmapper
    16  
    17  import (
    18  	"context"
    19  	"time"
    20  
    21  	"google.golang.org/protobuf/types/known/timestamppb"
    22  
    23  	"go.chromium.org/luci/common/clock"
    24  	"go.chromium.org/luci/common/errors"
    25  	"go.chromium.org/luci/common/logging"
    26  	"go.chromium.org/luci/common/retry/transient"
    27  	"go.chromium.org/luci/gae/service/datastore"
    28  
    29  	"go.chromium.org/luci/server/dsmapper/dsmapperpb"
    30  	"go.chromium.org/luci/server/dsmapper/internal/splitter"
    31  	"go.chromium.org/luci/server/tq"
    32  )
    33  
// ErrNoSuchJob is returned by GetJob if there's no Job with requested ID.
//
// Within this file it is what getJob returns when the datastore reports
// ErrNoSuchEntity. The error carries the tq.Fatal tag.
var ErrNoSuchJob = errors.New("no such mapping job", tq.Fatal)
    36  
// Query is a representation of datastore queries supported by the
// mapper.
//
// A query defines a set of entities the mapper operates on. Use
// ToDatastoreQuery to convert it into a *datastore.Query.
//
// This struct can be embedded into entities as is.
type Query struct {
	Kind     string         // entity kind to limit the query, "" for kindless
	Ancestor *datastore.Key // entity group to limit the query to (or nil)
}
    47  
    48  // ToDatastoreQuery returns corresponding datastore.Query.
    49  func (q *Query) ToDatastoreQuery() *datastore.Query {
    50  	dq := datastore.NewQuery(q.Kind)
    51  	if q.Ancestor != nil {
    52  		dq = dq.Ancestor(q.Ancestor)
    53  	}
    54  	return dq
    55  }
    56  
// JobConfig defines what a new mapping job should do.
//
// It should be supplied by the users of the mapper library. Use Validate to
// check the numeric fields before using the config.
type JobConfig struct {
	Query      Query  // a query identifying a set of entities
	Mapper     ID     // ID of a registered mapper to apply to entities
	Params     []byte // arbitrary user-provided data to pass to the mapper
	ShardCount int    // number of shards to split the key range into
	PageSize   int    // how many entities to process at once in each shard

	// Optional parameters below are for fine tuning. They have reasonable
	// defaults, and should generally not be touched.

	// PagesPerTask is how many pages (each of PageSize entities) to process
	// inside a TQ task.
	//
	// Default is unlimited: process until the deadline.
	PagesPerTask int

	// TaskDuration is how long to run a single mapping TQ task before
	// checkpointing the state and launching the next mapping TQ task.
	//
	// Small values (e.g. 1 min) make each processing TQ task relatively small,
	// so it doesn't eat a lot of memory or produce gigantic unreadable logs.
	// It also makes TQ's "Pause queue" button more handy.
	//
	// Default is 1 min.
	TaskDuration time.Duration

	// TrackProgress enables calculating the number of entities per shard before
	// launching mappers, and using it to calculate completion ETA.
	//
	// May be VERY slow if processing a large amount of entities. Slowness
	// manifests as a delay between the job's launch and the actual start of
	// shards processing.
	//
	// Enable only if shards are relatively small (< 100K entities per shard).
	TrackProgress bool
}
    95  
    96  // Validate returns an error of the config is invalid.
    97  //
    98  // Mapper existence is not checked.
    99  func (jc *JobConfig) Validate() error {
   100  	switch {
   101  	case jc.ShardCount < 1:
   102  		return errors.Reason("ShardCount should be >= 1, try 8").Err()
   103  	case jc.PageSize <= 0:
   104  		return errors.Reason("PageSize should be > 0, try 256").Err()
   105  	case jc.PagesPerTask < 0:
   106  		return errors.Reason("PagesPerTask should be >= 0, keep 0 for default").Err()
   107  	case jc.TaskDuration < 0:
   108  		return errors.Reason("TaskDuration should be >= 0, keep 0 for default").Err()
   109  	}
   110  	return nil
   111  }
   112  
// JobID identifies a mapping job.
//
// It is the type of the Job.ID field and of the shard.JobID back-reference.
type JobID int64
   115  
// Job is datastore representation of a mapping job (either active or not).
//
// It is a root entity with autogenerated key.
//
// Use Controller and Job methods to work with jobs. Attempting to use datastore
// API directly results in an undefined behavior.
type Job struct {
	// _kind pins the entity kind to "mapper.Job" via the $kind tag.
	_kind string `gae:"$kind,mapper.Job"`
	// _extra is tagged as the "extra" field; presumably it holds properties
	// that do not map to any other field of the struct (verify against
	// luci/gae docs).
	_extra datastore.PropertyMap `gae:"-,extra"`

	// ID is auto-generated unique identifier of the job.
	ID JobID `gae:"$id"`
	// Config is the configuration of this job. Doesn't change once set.
	Config JobConfig `gae:",noindex"`
	// State is used to track job's lifecycle, see the enum.
	State dsmapperpb.State
	// Created is when the job was created, FYI.
	Created time.Time
	// Updated is when the job was last touched, FYI.
	Updated time.Time
}
   137  
// shardList is an entity with a list of shard IDs associated with a job.
//
// A "static" singleton child entity of Job. Stored separately to allow callers
// to skip loading (potentially huge) list of shards if they are not interested
// in it.
type shardList struct {
	// _kind pins the entity kind to "mapper.ShardList" via the $kind tag.
	_kind string `gae:"$kind,mapper.ShardList"`
	// _id fixes the entity ID to 1 via the $id tag, which is what makes the
	// entity a singleton under its parent.
	_id int64 `gae:"$id,1"`
	// _extra is tagged as the "extra" field; presumably it holds properties
	// that do not map to any other field of the struct (verify against
	// luci/gae docs).
	_extra datastore.PropertyMap `gae:"-,extra"`

	// Parent is the key of the owning Job entity.
	Parent *datastore.Key `gae:"$parent"`
	// Shards is the list of IDs of shard entities that belong to the job.
	Shards []int64 `gae:",noindex"`
}
   151  
   152  // fetchShardIDs fetches IDs of the job shards.
   153  func (j *Job) fetchShardIDs(ctx context.Context) ([]int64, error) {
   154  	l := shardList{Parent: datastore.KeyForObj(ctx, j)}
   155  	switch err := datastore.Get(ctx, &l); {
   156  	case err == datastore.ErrNoSuchEntity:
   157  		return nil, errors.Annotate(err, "broken state, no ShardList entity for job %d", j.ID).Tag(tq.Fatal).Err()
   158  	case err != nil:
   159  		return nil, errors.Annotate(err, "when fetching list of shards of job %d", j.ID).Tag(transient.Tag).Err()
   160  	default:
   161  		return l.Shards, nil
   162  	}
   163  }
   164  
   165  // fetchShards fetches all job shards.
   166  func (j *Job) fetchShards(ctx context.Context) ([]shard, error) {
   167  	ids, err := j.fetchShardIDs(ctx)
   168  	if err != nil {
   169  		return nil, err
   170  	}
   171  
   172  	shards := make([]shard, len(ids))
   173  	for idx, sid := range ids {
   174  		shards[idx].ID = sid
   175  	}
   176  
   177  	if err := datastore.Get(ctx, shards); err != nil {
   178  		return nil, errors.Annotate(err, "failed to fetch some shards of job %d", j.ID).Tag(transient.Tag).Err()
   179  	}
   180  	return shards, nil
   181  }
   182  
   183  // FetchInfo fetches information about the job (including all shards).
   184  func (j *Job) FetchInfo(ctx context.Context) (*dsmapperpb.JobInfo, error) {
   185  	info := &dsmapperpb.JobInfo{
   186  		Id:            int64(j.ID),
   187  		State:         j.State,
   188  		Created:       timestamppb.New(j.Created),
   189  		Updated:       timestamppb.New(j.Updated),
   190  		TotalEntities: -1, // assume unknown, will be replaced below if known
   191  	}
   192  
   193  	// Jobs in STARTING state have no shards yet.
   194  	if j.State == dsmapperpb.State_STARTING {
   195  		return info, nil
   196  	}
   197  
   198  	shards, err := j.fetchShards(ctx)
   199  	if err != nil {
   200  		return nil, err
   201  	}
   202  
   203  	haveProgress := true // false if at least one shard has unknown ETA
   204  	updated := j.Updated // will be max(Updated of each shard)
   205  
   206  	info.Shards = make([]*dsmapperpb.ShardInfo, len(shards))
   207  	for i, s := range shards {
   208  		sh := s.info()
   209  		info.Shards[i] = sh
   210  		info.ProcessedEntities += sh.ProcessedEntities
   211  		if ts := sh.Updated.AsTime(); ts.After(updated) {
   212  			updated = ts
   213  		}
   214  		if sh.TotalEntities == -1 {
   215  			haveProgress = false
   216  		}
   217  	}
   218  
   219  	// Calculate the overall rate from scratch, do NOT sum rates of shards,
   220  	// since it will also sum estimation errors too (which can be wild).
   221  	info.Updated = timestamppb.New(updated)
   222  	if runtime := updated.Sub(j.Created); runtime > 0 {
   223  		info.EntitiesPerSec = float32(float64(info.ProcessedEntities) / runtime.Seconds())
   224  	}
   225  
   226  	if haveProgress {
   227  		maxETA := time.Time{}
   228  
   229  		info.TotalEntities = 0
   230  		for _, s := range info.Shards {
   231  			info.TotalEntities += s.TotalEntities
   232  			if s.Eta != nil {
   233  				if ts := s.Eta.AsTime(); maxETA.IsZero() || ts.After(maxETA) {
   234  					maxETA = ts
   235  				}
   236  			}
   237  		}
   238  
   239  		// The job completes when its longest shard does. Shards do not pass work
   240  		// to each other.
   241  		if !maxETA.IsZero() {
   242  			info.Eta = timestamppb.New(maxETA)
   243  		}
   244  	}
   245  
   246  	return info, nil
   247  }
   248  
   249  // getJob fetches a Job entity.
   250  //
   251  // Recognizes and tags transient errors.
   252  func getJob(ctx context.Context, id JobID) (*Job, error) {
   253  	job := &Job{ID: id}
   254  	switch err := datastore.Get(ctx, job); {
   255  	case err == datastore.ErrNoSuchEntity:
   256  		return nil, ErrNoSuchJob
   257  	case err != nil:
   258  		return nil, errors.Annotate(err, "transient datastore error").Tag(transient.Tag).Err()
   259  	default:
   260  		return job, nil
   261  	}
   262  }
   263  
   264  // getJobInState fetches a Job entity and checks its state.
   265  //
   266  // Returns:
   267  //
   268  //	(*Job, nil) if the job is there and its state matches one of given states.
   269  //	(nil, nil) if the job is there, but in a different state.
   270  //	(nil, transient error) on datastore fetch errors.
   271  //	(nil, fatal error) if there's no such job at all.
   272  func getJobInState(ctx context.Context, id JobID, states ...dsmapperpb.State) (*Job, error) {
   273  	job, err := getJob(ctx, id)
   274  	if err != nil {
   275  		return nil, errors.Reason("failed to fetch job with ID %d", id).Err()
   276  	}
   277  	for _, s := range states {
   278  		if job.State == s {
   279  			return job, nil
   280  		}
   281  	}
   282  	logging.Infof(ctx, "Skipping the job: its state is %s, expecting one of %q", job.State, states)
   283  	return nil, nil
   284  }
   285  
// shard represents a key range being worked on by a single worker (Start, End].
//
// Shard entities are written to when workers checkpoint progress or finish.
// They are read when calculating overall progress of the job.
//
// It is a root entity with autogenerated key. Shards are associated with jobs
// via ShardList entity (owned by Job, for Job -> [Shard] queries), and via
// JobID property (for Shard -> Job queries). They are purposefully not a part
// of Job entity group, to avoid exceeding O(1) entity group write limit.
type shard struct {
	// _kind pins the entity kind to "mapper.Shard" via the $kind tag.
	_kind string `gae:"$kind,mapper.Shard"`
	// _extra is tagged as the "extra" field; presumably it holds properties
	// that do not map to any other field of the struct (verify against
	// luci/gae docs).
	_extra datastore.PropertyMap `gae:"-,extra"`

	// ID is auto-generated unique identifier of the shard.
	ID int64 `gae:"$id"`
	// JobID is ID of a job that owns this shard.
	JobID JobID
	// Index is the index of the shard in the job's shards list.
	Index int `gae:",noindex"`
	// State is used to track shard's lifecycle, see the enum.
	State dsmapperpb.State
	// Error is an error message for failed shards.
	Error string `gae:",noindex"`
	// ProcessTaskNum is next expected ProcessShard task number.
	//
	// getActiveShard compares it against the task number of the running task
	// to detect stale tasks.
	ProcessTaskNum int64 `gae:",noindex"`
	// Range is an entity key range covered by this shard.
	Range splitter.Range `gae:",noindex"`
	// ExpectedCount is expected number of entities in the shard, -1 if unknown.
	ExpectedCount int64 `gae:",noindex"`
	// ProcessedCount is the number of entities processed by the shard thus far.
	ProcessedCount int64 `gae:",noindex"`
	// ResumeFrom is the last processed key or nil if just starting.
	ResumeFrom *datastore.Key `gae:",noindex"`
	// Created is when the shard was created, FYI.
	//
	// Also used by info() as the start of the interval for rate calculation.
	Created time.Time
	// Updated is when the shard was last touched, FYI.
	//
	// Set by shardTxn on every save; used by info() as the end of the interval
	// for rate calculation.
	Updated time.Time
}
   324  
   325  // info returns a proto message with information about the shard.
   326  func (s *shard) info() *dsmapperpb.ShardInfo {
   327  	var rate float64
   328  	var eta *timestamppb.Timestamp
   329  
   330  	if runtime := s.Updated.Sub(s.Created); runtime > 0 {
   331  		rate = float64(s.ProcessedCount) / runtime.Seconds()
   332  		if s.ExpectedCount != -1 && rate > 0.0001 {
   333  			secs := float64(s.ExpectedCount) / rate
   334  			eta = timestamppb.New(s.Created.Add(time.Duration(float64(time.Second) * secs)))
   335  		}
   336  	}
   337  
   338  	return &dsmapperpb.ShardInfo{
   339  		Index:             int32(s.Index),
   340  		State:             s.State,
   341  		Error:             s.Error,
   342  		Created:           timestamppb.New(s.Created),
   343  		Updated:           timestamppb.New(s.Updated),
   344  		Eta:               eta, // nil if unknown
   345  		ProcessedEntities: s.ProcessedCount,
   346  		TotalEntities:     s.ExpectedCount, // -1 if unknown
   347  		EntitiesPerSec:    float32(rate),   // 0 if unknown
   348  	}
   349  }
   350  
   351  // getActiveShard returns shard entity with given ID if its still in active
   352  // state and its ProcessTaskNum matches the given taskNum.
   353  //
   354  // Returns:
   355  //
   356  //	(*shard, nil) if the shard is there and matches the criteria.
   357  //	(nil, nil) if the shard is there, but it doesn't match the criteria.
   358  //	(nil, transient error) on datastore fetch errors.
   359  //	(nil, fatal error) if there's no such shard at all.
   360  func getActiveShard(ctx context.Context, shardID, taskNum int64) (*shard, error) {
   361  	sh := &shard{ID: shardID}
   362  	switch err := datastore.Get(ctx, sh); {
   363  	case err == datastore.ErrNoSuchEntity:
   364  		return nil, errors.Annotate(err, "no such shard, aborting").Tag(tq.Fatal).Err() // fatal, no retries
   365  	case err != nil:
   366  		return nil, errors.Annotate(err, "failed to fetch the shard").Tag(transient.Tag).Err()
   367  	case isFinalState(sh.State):
   368  		logging.Warningf(ctx, "The shard is finished already")
   369  		return nil, nil
   370  	case sh.ProcessTaskNum != taskNum:
   371  		logging.Warningf(ctx, "The task is stale (shard's task_num is %d, but task's is %d). Skipping it", sh.ProcessTaskNum, taskNum)
   372  		return nil, nil
   373  	default:
   374  		return sh, nil
   375  	}
   376  }
   377  
// shardTxnCb examines and optionally mutates the shard.
//
// It is invoked by shardTxn with the freshly fetched shard entity.
//
// It returns (true, nil) to instruct shardTxn to store the shard, (false, nil)
// to skip storing, and (..., err) to return the error.
type shardTxnCb func(ctx context.Context, sh *shard) (save bool, err error)
   383  
   384  // shardTxn fetches the shard and calls the callback to examine or mutate it.
   385  //
   386  // Silently skips finished shards.
   387  func shardTxn(ctx context.Context, shardID int64, cb shardTxnCb) error {
   388  	return runTxn(ctx, func(ctx context.Context) error {
   389  		sh := shard{ID: shardID}
   390  		switch err := datastore.Get(ctx, &sh); {
   391  		case err == datastore.ErrNoSuchEntity:
   392  			return errors.Annotate(err, "when fetching shard %d", shardID).Tag(tq.Fatal).Err()
   393  		case err != nil:
   394  			return errors.Annotate(err, "when fetching shard %d", shardID).Tag(transient.Tag).Err()
   395  		case isFinalState(sh.State):
   396  			return nil // the shard is already marked as done
   397  		}
   398  		switch save, err := cb(ctx, &sh); {
   399  		case err != nil:
   400  			return err
   401  		case !save:
   402  			return nil
   403  		default:
   404  			sh.Updated = clock.Now(ctx).UTC()
   405  			return transient.Tag.Apply(datastore.Put(ctx, &sh))
   406  		}
   407  	})
   408  }