// Source: go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/server/dsmapper/job.go

// Copyright 2018 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package dsmapper

import (
	"context"
	"time"

	"google.golang.org/protobuf/types/known/timestamppb"

	"go.chromium.org/luci/common/clock"
	"go.chromium.org/luci/common/errors"
	"go.chromium.org/luci/common/logging"
	"go.chromium.org/luci/common/retry/transient"
	"go.chromium.org/luci/gae/service/datastore"

	"go.chromium.org/luci/server/dsmapper/dsmapperpb"
	"go.chromium.org/luci/server/dsmapper/internal/splitter"
	"go.chromium.org/luci/server/tq"
)

// ErrNoSuchJob is returned by GetJob if there's no Job with requested ID.
//
// The error is created with the tq.Fatal tag attached.
var ErrNoSuchJob = errors.New("no such mapping job", tq.Fatal)

// Query is a representation of datastore queries supported by the
// mapper.
//
// A query defines a set of entities the mapper operates on.
//
// This struct can be embedded into entities as is.
type Query struct {
	Kind     string         // entity kind to limit the query, "" for kindless
	Ancestor *datastore.Key // entity group to limit the query to (or nil)
}

// ToDatastoreQuery returns corresponding datastore.Query.
49 func (q *Query) ToDatastoreQuery() *datastore.Query { 50 dq := datastore.NewQuery(q.Kind) 51 if q.Ancestor != nil { 52 dq = dq.Ancestor(q.Ancestor) 53 } 54 return dq 55 } 56 57 // JobConfig defines what a new mapping job should do. 58 // 59 // It should be supplied by the users of the mapper library. 60 type JobConfig struct { 61 Query Query // a query identifying a set of entities 62 Mapper ID // ID of a registered mapper to apply to entities 63 Params []byte // arbitrary user-provided data to pass to the mapper 64 ShardCount int // number of shards to split the key range into 65 PageSize int // how many entities to process at once in each shard 66 67 // Optional parameters below for fine tunning. They have reasonable defaults, 68 // and should generally be not touched. 69 70 // PagesPerTask is how many pages (each of PageSize entities) to process 71 // inside a TQ task. 72 // 73 // Default is unlimited: process until the deadline. 74 PagesPerTask int 75 76 // TaskDuration is how long to run a single mapping TQ task before 77 // checkpointing the state and launching the next mapping TQ task. 78 // 79 // Small values (e.g. 1 min) makes each processing TQ task relatively small, 80 // so it doesn't eat a lot of memory, or produces gigantic unreadable logs. 81 // It also makes TQ's "Pause queue" button more handy. 82 // 83 // Default is 1 min. 84 TaskDuration time.Duration 85 86 // TrackProgress enables calculating number of entities per shard before 87 // launching mappers, and using it to calculate completion ETA. 88 // 89 // May be VERY slow if processing large amount of entities. Slowness manifests 90 // as a delay between job's launch and it actual start of shards processing. 91 // 92 // Enable only if shards are relatively small (< 100K entities per shard). 93 TrackProgress bool 94 } 95 96 // Validate returns an error of the config is invalid. 97 // 98 // Mapper existence is not checked. 
99 func (jc *JobConfig) Validate() error { 100 switch { 101 case jc.ShardCount < 1: 102 return errors.Reason("ShardCount should be >= 1, try 8").Err() 103 case jc.PageSize <= 0: 104 return errors.Reason("PageSize should be > 0, try 256").Err() 105 case jc.PagesPerTask < 0: 106 return errors.Reason("PagesPerTask should be >= 0, keep 0 for default").Err() 107 case jc.TaskDuration < 0: 108 return errors.Reason("TaskDuration should be >= 0, keep 0 for default").Err() 109 } 110 return nil 111 } 112 113 // JobID identifies a mapping job. 114 type JobID int64 115 116 // Job is datastore representation of a mapping job (either active or not). 117 // 118 // It is a root entity with autogenerated key. 119 // 120 // Use Controller and Job methods to work with jobs. Attempting to use datastore 121 // API directly results in an undefined behavior. 122 type Job struct { 123 _kind string `gae:"$kind,mapper.Job"` 124 _extra datastore.PropertyMap `gae:"-,extra"` 125 126 // ID is auto-generated unique identifier of the job. 127 ID JobID `gae:"$id"` 128 // Config is the configuration of this job. Doesn't change once set. 129 Config JobConfig `gae:",noindex"` 130 // State is used to track job's lifecycle, see the enum. 131 State dsmapperpb.State 132 // Created is when the job was created, FYI. 133 Created time.Time 134 // Updated is when the job was last touched, FYI. 135 Updated time.Time 136 } 137 138 // shardList is an entity with a list of shard IDs associated with a job. 139 // 140 // A "static" singleton child entity of Job. Stored separately to allow callers 141 // to skip loading (potentially huge) list of shards if they are not interested 142 // in it. 143 type shardList struct { 144 _kind string `gae:"$kind,mapper.ShardList"` 145 _id int64 `gae:"$id,1"` 146 _extra datastore.PropertyMap `gae:"-,extra"` 147 148 Parent *datastore.Key `gae:"$parent"` 149 Shards []int64 `gae:",noindex"` 150 } 151 152 // fetchShardIDs fetches IDs of the job shards. 
153 func (j *Job) fetchShardIDs(ctx context.Context) ([]int64, error) { 154 l := shardList{Parent: datastore.KeyForObj(ctx, j)} 155 switch err := datastore.Get(ctx, &l); { 156 case err == datastore.ErrNoSuchEntity: 157 return nil, errors.Annotate(err, "broken state, no ShardList entity for job %d", j.ID).Tag(tq.Fatal).Err() 158 case err != nil: 159 return nil, errors.Annotate(err, "when fetching list of shards of job %d", j.ID).Tag(transient.Tag).Err() 160 default: 161 return l.Shards, nil 162 } 163 } 164 165 // fetchShards fetches all job shards. 166 func (j *Job) fetchShards(ctx context.Context) ([]shard, error) { 167 ids, err := j.fetchShardIDs(ctx) 168 if err != nil { 169 return nil, err 170 } 171 172 shards := make([]shard, len(ids)) 173 for idx, sid := range ids { 174 shards[idx].ID = sid 175 } 176 177 if err := datastore.Get(ctx, shards); err != nil { 178 return nil, errors.Annotate(err, "failed to fetch some shards of job %d", j.ID).Tag(transient.Tag).Err() 179 } 180 return shards, nil 181 } 182 183 // FetchInfo fetches information about the job (including all shards). 184 func (j *Job) FetchInfo(ctx context.Context) (*dsmapperpb.JobInfo, error) { 185 info := &dsmapperpb.JobInfo{ 186 Id: int64(j.ID), 187 State: j.State, 188 Created: timestamppb.New(j.Created), 189 Updated: timestamppb.New(j.Updated), 190 TotalEntities: -1, // assume unknown, will be replaced below if known 191 } 192 193 // Jobs in STARTING state have no shards yet. 
194 if j.State == dsmapperpb.State_STARTING { 195 return info, nil 196 } 197 198 shards, err := j.fetchShards(ctx) 199 if err != nil { 200 return nil, err 201 } 202 203 haveProgress := true // false if at least one shard has unknown ETA 204 updated := j.Updated // will be max(Updated of each shard) 205 206 info.Shards = make([]*dsmapperpb.ShardInfo, len(shards)) 207 for i, s := range shards { 208 sh := s.info() 209 info.Shards[i] = sh 210 info.ProcessedEntities += sh.ProcessedEntities 211 if ts := sh.Updated.AsTime(); ts.After(updated) { 212 updated = ts 213 } 214 if sh.TotalEntities == -1 { 215 haveProgress = false 216 } 217 } 218 219 // Calculate the overall rate from scratch, do NOT sum rates of shards, 220 // since it will also sum estimation errors too (which can be wild). 221 info.Updated = timestamppb.New(updated) 222 if runtime := updated.Sub(j.Created); runtime > 0 { 223 info.EntitiesPerSec = float32(float64(info.ProcessedEntities) / runtime.Seconds()) 224 } 225 226 if haveProgress { 227 maxETA := time.Time{} 228 229 info.TotalEntities = 0 230 for _, s := range info.Shards { 231 info.TotalEntities += s.TotalEntities 232 if s.Eta != nil { 233 if ts := s.Eta.AsTime(); maxETA.IsZero() || ts.After(maxETA) { 234 maxETA = ts 235 } 236 } 237 } 238 239 // The job completes when its longest shard does. Shards do not pass work 240 // to each other. 241 if !maxETA.IsZero() { 242 info.Eta = timestamppb.New(maxETA) 243 } 244 } 245 246 return info, nil 247 } 248 249 // getJob fetches a Job entity. 250 // 251 // Recognizes and tags transient errors. 252 func getJob(ctx context.Context, id JobID) (*Job, error) { 253 job := &Job{ID: id} 254 switch err := datastore.Get(ctx, job); { 255 case err == datastore.ErrNoSuchEntity: 256 return nil, ErrNoSuchJob 257 case err != nil: 258 return nil, errors.Annotate(err, "transient datastore error").Tag(transient.Tag).Err() 259 default: 260 return job, nil 261 } 262 } 263 264 // getJobInState fetches a Job entity and checks its state. 
265 // 266 // Returns: 267 // 268 // (*Job, nil) if the job is there and its state matches one of given states. 269 // (nil, nil) if the job is there, but in a different state. 270 // (nil, transient error) on datastore fetch errors. 271 // (nil, fatal error) if there's no such job at all. 272 func getJobInState(ctx context.Context, id JobID, states ...dsmapperpb.State) (*Job, error) { 273 job, err := getJob(ctx, id) 274 if err != nil { 275 return nil, errors.Reason("failed to fetch job with ID %d", id).Err() 276 } 277 for _, s := range states { 278 if job.State == s { 279 return job, nil 280 } 281 } 282 logging.Infof(ctx, "Skipping the job: its state is %s, expecting one of %q", job.State, states) 283 return nil, nil 284 } 285 286 // shard represents a key range being worked on by a single worker (Start, End]. 287 // 288 // Shard entities are written to when workers checkpoint progress or finish. 289 // They are read when calculating overall progress of the job. 290 // 291 // It is a root entity with autogenerated key. Shards are associated with jobs 292 // via ShardList entity (owned by Job, for Job -> [Shard] queries), and via 293 // JobID property (for Shard -> Job queries). They are purposefully not a part 294 // of Job entity group, to avoid exceeding O(1) entity group write limit. 295 type shard struct { 296 _kind string `gae:"$kind,mapper.Shard"` 297 _extra datastore.PropertyMap `gae:"-,extra"` 298 299 // ID is auto-generated unique identifier of the shard. 300 ID int64 `gae:"$id"` 301 // JobID is ID of a job that owns this shard. 302 JobID JobID 303 // Index is the index of the shard in the job's shards list. 304 Index int `gae:",noindex"` 305 // State is used to track shard's lifecycle, see the enum. 306 State dsmapperpb.State 307 // Error is an error message for failed shards. 308 Error string `gae:",noindex"` 309 // ProcessTaskNum is next expected ProcessShard task number. 
310 ProcessTaskNum int64 `gae:",noindex"` 311 // Range is an entity key range covered by this shard. 312 Range splitter.Range `gae:",noindex"` 313 // ExpectedCount is expected number of entities in the shard, -1 if unknown. 314 ExpectedCount int64 `gae:",noindex"` 315 // ProcessedCount is number entities processed by the shard thus far. 316 ProcessedCount int64 `gae:",noindex"` 317 // ResumeFrom is the last processed key or nil if just starting. 318 ResumeFrom *datastore.Key `gae:",noindex"` 319 // Created is when the shard was created, FYI. 320 Created time.Time 321 // Updated is when the shard was last touched, FYI. 322 Updated time.Time 323 } 324 325 // info returns a proto message with information about the shard. 326 func (s *shard) info() *dsmapperpb.ShardInfo { 327 var rate float64 328 var eta *timestamppb.Timestamp 329 330 if runtime := s.Updated.Sub(s.Created); runtime > 0 { 331 rate = float64(s.ProcessedCount) / runtime.Seconds() 332 if s.ExpectedCount != -1 && rate > 0.0001 { 333 secs := float64(s.ExpectedCount) / rate 334 eta = timestamppb.New(s.Created.Add(time.Duration(float64(time.Second) * secs))) 335 } 336 } 337 338 return &dsmapperpb.ShardInfo{ 339 Index: int32(s.Index), 340 State: s.State, 341 Error: s.Error, 342 Created: timestamppb.New(s.Created), 343 Updated: timestamppb.New(s.Updated), 344 Eta: eta, // nil if unknown 345 ProcessedEntities: s.ProcessedCount, 346 TotalEntities: s.ExpectedCount, // -1 if unknown 347 EntitiesPerSec: float32(rate), // 0 if unknown 348 } 349 } 350 351 // getActiveShard returns shard entity with given ID if its still in active 352 // state and its ProcessTaskNum matches the given taskNum. 353 // 354 // Returns: 355 // 356 // (*shard, nil) if the shard is there and matches the criteria. 357 // (nil, nil) if the shard is there, but it doesn't match the criteria. 358 // (nil, transient error) on datastore fetch errors. 359 // (nil, fatal error) if there's no such shard at all. 
360 func getActiveShard(ctx context.Context, shardID, taskNum int64) (*shard, error) { 361 sh := &shard{ID: shardID} 362 switch err := datastore.Get(ctx, sh); { 363 case err == datastore.ErrNoSuchEntity: 364 return nil, errors.Annotate(err, "no such shard, aborting").Tag(tq.Fatal).Err() // fatal, no retries 365 case err != nil: 366 return nil, errors.Annotate(err, "failed to fetch the shard").Tag(transient.Tag).Err() 367 case isFinalState(sh.State): 368 logging.Warningf(ctx, "The shard is finished already") 369 return nil, nil 370 case sh.ProcessTaskNum != taskNum: 371 logging.Warningf(ctx, "The task is stale (shard's task_num is %d, but task's is %d). Skipping it", sh.ProcessTaskNum, taskNum) 372 return nil, nil 373 default: 374 return sh, nil 375 } 376 } 377 378 // shardTxnCb examines and optionally mutates the shard. 379 // 380 // It returns (true, nil) to instruct shardTxn to store the shard, (false, nil) 381 // to skip storing, and (..., err) to return the error. 382 type shardTxnCb func(ctx context.Context, sh *shard) (save bool, err error) 383 384 // shardTxn fetches the shard and calls the callback to examine or mutate it. 385 // 386 // Silently skips finished shards. 387 func shardTxn(ctx context.Context, shardID int64, cb shardTxnCb) error { 388 return runTxn(ctx, func(ctx context.Context) error { 389 sh := shard{ID: shardID} 390 switch err := datastore.Get(ctx, &sh); { 391 case err == datastore.ErrNoSuchEntity: 392 return errors.Annotate(err, "when fetching shard %d", shardID).Tag(tq.Fatal).Err() 393 case err != nil: 394 return errors.Annotate(err, "when fetching shard %d", shardID).Tag(transient.Tag).Err() 395 case isFinalState(sh.State): 396 return nil // the shard is already marked as done 397 } 398 switch save, err := cb(ctx, &sh); { 399 case err != nil: 400 return err 401 case !save: 402 return nil 403 default: 404 sh.Updated = clock.Now(ctx).UTC() 405 return transient.Tag.Apply(datastore.Put(ctx, &sh)) 406 } 407 }) 408 }