github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ext/dload/job.go (about) 1 // Package dload implements functionality to download resources into AIS cluster from external source. 2 /* 3 * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved. 4 */ 5 package dload 6 7 import ( 8 "errors" 9 "fmt" 10 "path" 11 "strings" 12 "time" 13 14 "github.com/NVIDIA/aistore/api/apc" 15 "github.com/NVIDIA/aistore/cmn" 16 "github.com/NVIDIA/aistore/cmn/atomic" 17 "github.com/NVIDIA/aistore/cmn/cos" 18 "github.com/NVIDIA/aistore/cmn/debug" 19 "github.com/NVIDIA/aistore/cmn/nlog" 20 "github.com/NVIDIA/aistore/core" 21 "github.com/NVIDIA/aistore/core/meta" 22 "github.com/NVIDIA/aistore/nl" 23 ) 24 25 const ( 26 // Determines the size of single batch size generated in `genNext`. 27 downloadBatchSize = 10_000 28 ) 29 30 // interface guard 31 var ( 32 _ jobif = (*sliceDlJob)(nil) 33 _ jobif = (*backendDlJob)(nil) 34 _ jobif = (*rangeDlJob)(nil) 35 ) 36 37 type ( 38 dlObj struct { 39 objName string 40 link string 41 fromRemote bool 42 } 43 44 jobif interface { 45 ID() string 46 XactID() string 47 Bck() *cmn.Bck 48 Description() string 49 Timeout() time.Duration 50 ActiveStats() (*StatusResp, error) 51 String() string 52 Notif() core.Notif // notifications 53 AddNotif(n core.Notif, job jobif) 54 55 // If total length (size) of download job is not known, -1 should be returned. 56 Len() int 57 58 // Determines if it requires also syncing. 59 Sync() bool 60 61 // Checks if object name matches the request. 62 checkObj(objName string) bool 63 64 // genNext is supposed to fulfill the following protocol: 65 // `ok` is set to `true` if there is batch to process, `false` otherwise 66 genNext() (objs []dlObj, ok bool, err error) 67 68 // via tryAcquire and release 69 throttler() *throttler 70 71 // job cleanup 72 cleanup() 73 } 74 75 baseDlJob struct { 76 bck *meta.Bck 77 notif *NotifDownload 78 xdl *Xact 79 id string 80 description string 81 timeout time.Duration 82 throt throttler 83 } 84 85 sliceDlJob struct { 86 baseDlJob 87 objs []dlObj 88 current int 89 } 90 multiDlJob struct { 91 sliceDlJob 92 } 93 singleDlJob struct { 94 sliceDlJob 95 } 96 97 rangeDlJob struct { 98 baseDlJob 99 objs []dlObj // objects' metas which are ready to be downloaded 100 pt cos.ParsedTemplate // range template 101 dir string // objects directory(prefix) from request 102 count int // total number object to download by a target 103 done bool // true when iterator is finished, nothing left to read 104 } 105 106 backendDlJob struct { 107 baseDlJob 108 prefix string 109 suffix string 110 continuationToken string 111 objs []dlObj // objects' metas which are ready to be downloaded 112 sync bool 113 done bool 114 } 115 116 dljob struct { 117 id string 118 xid string 119 description string 120 startedTime time.Time 121 finishedTime atomic.Time 122 finishedCnt atomic.Int32 123 scheduledCnt atomic.Int32 124 skippedCnt atomic.Int32 125 errorCnt atomic.Int32 126 total int 127 aborted atomic.Bool 128 allDispatched atomic.Bool 129 } 130 ) 131 132 /////////////// 133 // baseDlJob // 134 /////////////// 135 136 func (j *baseDlJob) init(id string, bck *meta.Bck, timeout, desc string, limits Limits, xdl *Xact) { 137 // TODO: this might be inaccurate if we download 1 or 2 objects because then 138 // other targets will have limits but will not use them. 139 if limits.BytesPerHour > 0 { 140 limits.BytesPerHour /= core.T.Sowner().Get().CountActiveTs() 141 } 142 td, _ := time.ParseDuration(timeout) 143 { 144 j.id = id 145 j.bck = bck 146 j.timeout = td 147 j.description = desc 148 j.throt.init(limits) 149 j.xdl = xdl 150 } 151 } 152 153 func (j *baseDlJob) ID() string { return j.id } 154 func (j *baseDlJob) XactID() string { return j.xdl.ID() } 155 func (j *baseDlJob) Bck() *cmn.Bck { return j.bck.Bucket() } 156 func (j *baseDlJob) Timeout() time.Duration { return j.timeout } 157 func (j *baseDlJob) Description() string { return j.description } 158 func (*baseDlJob) Sync() bool { return false } 159 160 func (j *baseDlJob) String() (s string) { 161 s = fmt.Sprintf("dl-job[%s]-%s", j.ID(), j.Bck()) 162 if j.Description() == "" { 163 return 164 } 165 return s + "-" + j.Description() 166 } 167 168 func (j *baseDlJob) Notif() core.Notif { return j.notif } 169 170 func (j *baseDlJob) AddNotif(n core.Notif, job jobif) { 171 var ok bool 172 debug.Assert(j.notif == nil) // currently, "add" means "set" 173 j.notif, ok = n.(*NotifDownload) 174 debug.Assert(ok) 175 j.notif.job = job 176 debug.Assert(j.notif.F != nil) 177 if n.Upon(core.UponProgress) { 178 debug.Assert(j.notif.P != nil) 179 } 180 } 181 182 func (j *baseDlJob) ActiveStats() (*StatusResp, error) { 183 resp, _, err := j.xdl.JobStatus(j.ID(), true /*onlyActive*/) 184 if err != nil { 185 return nil, err 186 } 187 return resp.(*StatusResp), nil 188 } 189 190 func (*baseDlJob) checkObj(string) bool { debug.Assert(false); return false } 191 func (j *baseDlJob) throttler() *throttler { return &j.throt } 192 193 func (j *baseDlJob) cleanup() { 194 j.throttler().stop() 195 err, aborted := g.store.markFinished(j.ID()) 196 aborted = aborted || j.xdl.IsAborted() // TODO: assert equality 197 if err != nil { 198 nlog.Errorln(j.String()+":", err, aborted) 199 } 200 g.store.flush(j.ID()) 201 nl.OnFinished(j.Notif(), err, aborted) 202 } 203 204 // 205 // sliceDlJob -- multiDlJob -- singleDlJob 206 // 207 208 func (j *sliceDlJob) init(bck *meta.Bck, objects cos.StrKVs) error { 209 objs, err := buildDlObjs(bck, objects) 210 if err != nil { 211 return err 212 } 213 j.objs = objs 214 return nil 215 } 216 217 func (j *sliceDlJob) Len() int { return len(j.objs) } 218 219 func (j *sliceDlJob) genNext() (objs []dlObj, ok bool, err error) { 220 if j.current == len(j.objs) { 221 return nil, false, nil 222 } 223 if j.current+downloadBatchSize >= len(j.objs) { 224 objs = j.objs[j.current:] 225 j.current = len(j.objs) 226 return objs, true, nil 227 } 228 229 objs = j.objs[j.current : j.current+downloadBatchSize] 230 j.current += downloadBatchSize 231 return objs, true, nil 232 } 233 234 func newMultiDlJob(id string, bck *meta.Bck, payload *MultiBody, xdl *Xact) (mj *multiDlJob, err error) { 235 var objs cos.StrKVs 236 237 mj = &multiDlJob{} 238 mj.baseDlJob.init(id, bck, payload.Timeout, payload.Describe(), payload.Limits, xdl) 239 240 if objs, err = payload.ExtractPayload(); err != nil { 241 return nil, err 242 } 243 err = mj.sliceDlJob.init(bck, objs) 244 return 245 } 246 247 func (j *multiDlJob) String() (s string) { return "multi-" + j.baseDlJob.String() } 248 249 func newSingleDlJob(id string, bck *meta.Bck, payload *SingleBody, xdl *Xact) (sj *singleDlJob, err error) { 250 var objs cos.StrKVs 251 252 sj = &singleDlJob{} 253 sj.baseDlJob.init(id, bck, payload.Timeout, payload.Describe(), payload.Limits, xdl) 254 255 if objs, err = payload.ExtractPayload(); err != nil { 256 return nil, err 257 } 258 err = sj.sliceDlJob.init(bck, objs) 259 return 260 } 261 262 func (j *singleDlJob) String() (s string) { 263 return "single-" + j.baseDlJob.String() 264 } 265 266 //////////////// 267 // rangeDlJob // 268 //////////////// 269 270 // NOTE: the sizes of objects to be downloaded will be unknown. 271 func newRangeDlJob(id string, bck *meta.Bck, payload *RangeBody, xdl *Xact) (rj *rangeDlJob, err error) { 272 rj = &rangeDlJob{} 273 if rj.pt, err = cos.ParseBashTemplate(payload.Template); err != nil { 274 return nil, err 275 } 276 rj.baseDlJob.init(id, bck, payload.Timeout, payload.Describe(), payload.Limits, xdl) 277 278 if rj.count, err = countObjects(rj.pt, payload.Subdir, rj.bck); err != nil { 279 return nil, err 280 } 281 rj.pt.InitIter() 282 rj.dir = payload.Subdir 283 return 284 } 285 286 func (j *rangeDlJob) SrcBck() *cmn.Bck { return j.bck.Bucket() } 287 func (j *rangeDlJob) Len() int { return j.count } 288 289 func (j *rangeDlJob) genNext() ([]dlObj, bool, error) { 290 if j.done { 291 return nil, false, nil 292 } 293 if err := j.getNextObjs(); err != nil { 294 return nil, false, err 295 } 296 return j.objs, true, nil 297 } 298 299 func (j *rangeDlJob) String() (s string) { 300 return fmt.Sprintf("range-%s-%d-%s", &j.baseDlJob, j.count, j.dir) 301 } 302 303 func (j *rangeDlJob) getNextObjs() error { 304 var ( 305 smap = core.T.Sowner().Get() 306 sid = core.T.SID() 307 ) 308 j.objs = j.objs[:0] 309 for len(j.objs) < downloadBatchSize { 310 link, ok := j.pt.Next() 311 if !ok { 312 j.done = true 313 break 314 } 315 name := path.Join(j.dir, path.Base(link)) 316 obj, err := makeDlObj(smap, sid, j.bck, name, link) 317 if err != nil { 318 if err == errInvalidTarget { 319 continue 320 } 321 return err 322 } 323 j.objs = append(j.objs, obj) 324 } 325 return nil 326 } 327 328 ////////////////// 329 // backendDlJob // 330 ////////////////// 331 332 func newBackendDlJob(id string, bck *meta.Bck, payload *BackendBody, xdl *Xact) (bj *backendDlJob, err error) { 333 if !bck.IsRemote() { 334 return nil, errors.New("bucket download requires a remote bucket") 335 } else if bck.IsHTTP() { 336 return nil, errors.New("bucket download does not support HTTP buckets") 337 } 338 bj = &backendDlJob{} 339 bj.baseDlJob.init(id, bck, payload.Timeout, payload.Describe(), payload.Limits, xdl) 340 { 341 bj.sync = payload.Sync 342 bj.prefix = payload.Prefix 343 bj.suffix = payload.Suffix 344 } 345 return 346 } 347 348 func (*backendDlJob) Len() int { return -1 } 349 func (j *backendDlJob) Sync() bool { return j.sync } 350 351 func (j *backendDlJob) String() (s string) { 352 return fmt.Sprintf("backend-%s-%s-%s", &j.baseDlJob, j.prefix, j.suffix) 353 } 354 355 func (j *backendDlJob) checkObj(objName string) bool { 356 return strings.HasPrefix(objName, j.prefix) && strings.HasSuffix(objName, j.suffix) 357 } 358 359 func (j *backendDlJob) genNext() (objs []dlObj, ok bool, err error) { 360 if j.done { 361 return nil, false, nil 362 } 363 if err := j.getNextObjs(); err != nil { 364 return nil, false, err 365 } 366 return j.objs, true, nil 367 } 368 369 // Reads the content of a remote bucket page by page until any objects to 370 // download found or the bucket list is over. 371 func (j *backendDlJob) getNextObjs() error { 372 var ( 373 sid = core.T.SID() 374 smap = core.T.Sowner().Get() 375 backend = core.T.Backend(j.bck) 376 ) 377 j.objs = j.objs[:0] 378 for len(j.objs) < downloadBatchSize { 379 var ( 380 lst = &cmn.LsoRes{} 381 msg = &apc.LsoMsg{Prefix: j.prefix, ContinuationToken: j.continuationToken, PageSize: j.bck.MaxPageSize()} 382 ) 383 _, err := backend.ListObjects(j.bck, msg, lst) 384 if err != nil { 385 return err 386 } 387 j.continuationToken = lst.ContinuationToken 388 389 for _, entry := range lst.Entries { 390 if !j.checkObj(entry.Name) { 391 continue 392 } 393 obj, err := makeDlObj(smap, sid, j.bck, entry.Name, "") 394 if err != nil { 395 if err == errInvalidTarget { 396 continue 397 } 398 return err 399 } 400 j.objs = append(j.objs, obj) 401 } 402 if j.continuationToken == "" { 403 j.done = true 404 break 405 } 406 } 407 return nil 408 } 409 410 /////////// 411 // dljob // 412 /////////// 413 414 func (j *dljob) clone() Job { 415 return Job{ 416 ID: j.id, 417 XactID: j.xid, 418 Description: j.description, 419 FinishedCnt: int(j.finishedCnt.Load()), 420 ScheduledCnt: int(j.scheduledCnt.Load()), 421 SkippedCnt: int(j.skippedCnt.Load()), 422 ErrorCnt: int(j.errorCnt.Load()), 423 Total: j.total, 424 AllDispatched: j.allDispatched.Load(), 425 Aborted: j.aborted.Load(), 426 StartedTime: j.startedTime, 427 FinishedTime: j.finishedTime.Load(), 428 } 429 } 430 431 // Used for debugging purposes to ensure integrity of the struct. 432 func (j *dljob) valid() (err error) { 433 if j.aborted.Load() { 434 return 435 } 436 if !j.allDispatched.Load() { 437 return 438 } 439 if a, b, c := j.scheduledCnt.Load(), j.finishedCnt.Load(), j.errorCnt.Load(); a != b+c { 440 err = fmt.Errorf("invalid: %d != %d + %d", a, b, c) 441 } 442 return 443 }