github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ext/dload/jogger.go (about) 1 // Package dload implements functionality to download resources into AIS cluster from external source. 2 /* 3 * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved. 4 */ 5 package dload 6 7 import ( 8 "sync" 9 10 "github.com/NVIDIA/aistore/cmn" 11 "github.com/NVIDIA/aistore/cmn/cos" 12 "github.com/NVIDIA/aistore/cmn/nlog" 13 "github.com/NVIDIA/aistore/core" 14 ) 15 16 const queueChSize = 1000 17 18 type ( 19 queueEntry = map[string]struct{} 20 21 queue struct { 22 ch chan *singleTask // for pending downloads 23 m map[string]queueEntry // jobID -> set of request uid 24 mu sync.RWMutex 25 } 26 27 // Each jogger corresponds to an mpath. All types of download requests 28 // corresponding to the jogger's mpath are forwarded to the jogger. Joggers 29 // exist in the Downloader's jogger member variable, and run only when there 30 // are dlTasks. 31 jogger struct { 32 mpath string 33 terminateCh cos.StopCh // synchronizes termination 34 parent *dispatcher 35 q *queue 36 task *singleTask // currently running download task 37 mtx sync.Mutex 38 stopAgent bool 39 } 40 ) 41 42 func newJogger(d *dispatcher, mpath string) (j *jogger) { 43 j = &jogger{mpath: mpath, parent: d, q: newQueue()} 44 j.terminateCh.Init() 45 return 46 } 47 48 func (j *jogger) jog() { 49 for { 50 t := j.q.get() 51 if t == nil { 52 break 53 } 54 55 j.mtx.Lock() 56 // Check if the task exists to ensure that the job wasn't removed while 57 // we waited on the queue. We must do it under the jogger's lock to ensure that 58 // there is no race between aborting job and marking it as being handled. 59 if !j.taskExists(t) { 60 t.job.throttler().release() 61 j.mtx.Unlock() 62 continue 63 } 64 65 if j.stopAgent { 66 // Jogger has been stopped so we must mark task as failed. We do not 67 // `break` here because we want to drain the queue, otherwise some 68 // of the tasks may be in the queue and therefore the finished 69 // counter won't be correct. 70 t.job.throttler().release() 71 t.markFailed(internalErrorMsg) 72 j.mtx.Unlock() 73 continue 74 } 75 76 j.task = t 77 j.task.init() 78 j.mtx.Unlock() 79 80 // do 81 lom := core.AllocLOM(t.obj.objName) 82 t.download(lom) 83 84 // finish, cleanup 85 core.FreeLOM(lom) 86 t.cancel() 87 88 t.job.throttler().release() 89 90 j.mtx.Lock() 91 j.task.persist() 92 j.task = nil 93 j.mtx.Unlock() 94 if j.q.del(t) { 95 j.parent.xdl.DecPending() 96 } 97 } 98 99 j.q.cleanup() 100 j.terminateCh.Close() 101 } 102 103 // stop terminates the jogger and waits for it to finish. 104 func (j *jogger) stop() { 105 nlog.Infof("Stopping jogger for mpath: %s", j.mpath) 106 107 j.mtx.Lock() 108 j.stopAgent = true 109 if j.task != nil { 110 j.task.cancel() // Stops running task (cancels download). 111 } 112 j.mtx.Unlock() 113 j.q.close() 114 115 <-j.terminateCh.Listen() 116 } 117 118 // Returns channel which task should be put into. 119 func (j *jogger) putCh(t *singleTask) chan<- *singleTask { 120 j.q.mu.Lock() 121 ok, ch := j.q.putCh(t) 122 j.q.mu.Unlock() 123 if ok { 124 j.parent.xdl.IncPending() 125 } 126 return ch 127 } 128 129 func (j *jogger) getTask(jobID string) (task *singleTask) { 130 j.mtx.Lock() 131 if j.task != nil && j.task.jobID() == jobID { 132 task = j.task 133 } 134 j.mtx.Unlock() 135 return task 136 } 137 138 func (j *jogger) abortJob(id string) { 139 var task *singleTask 140 141 j.mtx.Lock() 142 143 j.q.mu.Lock() 144 cnt := j.q.removeJob(id) // remove from pending 145 j.q.mu.Unlock() 146 j.parent.xdl.SubPending(cnt) 147 148 if j.task != nil && j.task.jobID() == id { 149 task = j.task 150 // iff the task belongs to the specified job 151 j.task.cancel() 152 } 153 154 j.mtx.Unlock() 155 156 if task != nil && cmn.Rom.FastV(4, cos.SmoduleDload) /*verbose*/ { 157 nlog.Infof("%s: abort-job[%s, mpath=%s], task=%s", core.T.String(), id, j.mpath, j.task.String()) 158 } 159 } 160 161 func (j *jogger) taskExists(t *singleTask) (exists bool) { 162 j.q.mu.RLock() 163 exists = j.q.exists(t.jobID(), t.uid()) 164 j.q.mu.RUnlock() 165 return exists 166 } 167 168 // Returns true if there is any pending task for a given job (either running or in queue), 169 // false otherwise. 170 func (j *jogger) pending(id string) bool { 171 task := j.getTask(id) 172 return task != nil || j.q.pending(id) 173 } 174 175 func newQueue() *queue { 176 return &queue{ 177 ch: make(chan *singleTask, queueChSize), 178 m: make(map[string]queueEntry), 179 } 180 } 181 182 // PRECONDITION: `q.Lock()` must be taken. 183 func (q *queue) putCh(t *singleTask) (ok bool, ch chan<- *singleTask) { 184 if q.stopped() || q.exists(t.jobID(), t.uid()) { 185 // If task already exists or the queue was stopped we should just omit it 186 // hence return channel which immediately accepts and omits the task. 187 return false, make(chan *singleTask, 1) 188 } 189 q.putToSet(t.jobID(), t.uid()) 190 return true, q.ch 191 } 192 193 // get retrieves first task in the queue. 194 func (q *queue) get() (foundTask *singleTask) { 195 t, ok := <-q.ch 196 if !ok { 197 return nil 198 } 199 200 // NOTE: We do not delete task here but postpone it until the task 201 // has `Finished` to prevent situation where we put task which is 202 // being downloaded. 203 return t 204 } 205 206 func (q *queue) del(t *singleTask) bool { 207 q.mu.Lock() 208 deleted := q.removeFromSet(t.jobID(), t.uid()) 209 q.mu.Unlock() 210 return deleted 211 } 212 213 func (q *queue) cleanup() { 214 q.mu.Lock() 215 q.ch = nil 216 q.m = nil 217 q.mu.Unlock() 218 } 219 220 // PRECONDITION: `q.RLock()` must be taken. 221 func (q *queue) stopped() bool { 222 return q.m == nil || q.ch == nil 223 } 224 225 // PRECONDITION: `q.RLock()` must be taken. 226 func (q *queue) exists(jobID, requestUID string) bool { 227 jobM, ok := q.m[jobID] 228 if !ok { 229 return false 230 } 231 232 _, ok = jobM[requestUID] 233 return ok 234 } 235 236 func (q *queue) pending(jobID string) (exists bool) { 237 q.mu.RLock() 238 _, exists = q.m[jobID] 239 q.mu.RUnlock() 240 return exists 241 } 242 243 // PRECONDITION: `q.Lock()` must be taken. 244 func (q *queue) putToSet(jobID, requestUID string) { 245 if _, ok := q.m[jobID]; !ok { 246 q.m[jobID] = make(queueEntry) 247 } 248 q.m[jobID][requestUID] = struct{}{} 249 } 250 251 // PRECONDITION: `q.Lock()` must be taken. 252 func (q *queue) removeFromSet(jobID, requestUID string) (deleted bool) { 253 jobM, ok := q.m[jobID] 254 if !ok { 255 return false 256 } 257 258 if _, ok := jobM[requestUID]; ok { 259 delete(jobM, requestUID) 260 if len(jobM) == 0 { 261 delete(q.m, jobID) 262 } 263 return true 264 } 265 return false 266 } 267 268 // PRECONDITION: `q.Lock()` must be taken. 269 func (q *queue) removeJob(id string) int { 270 if q.stopped() { 271 return 0 272 } 273 jobM, ok := q.m[id] 274 if !ok { 275 return 0 276 } 277 delete(q.m, id) 278 return len(jobM) 279 } 280 281 func (q *queue) close() { 282 if q.ch != nil { 283 close(q.ch) 284 } 285 }